pppm_disp_intel.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Sep 28, 03:27

pppm_disp_intel.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing authors: William McDoniel (RWTH Aachen University)
	------------------------------------------------------------------------- */

	#include <mpi.h>
	#include <stdlib.h>
	#include <math.h>
	#include "pppm_disp_intel.h"
	#include "atom.h"
	#include "error.h"
	#include "fft3d_wrap.h"
	#include "gridcomm.h"
	#include "math_const.h"
	#include "math_special.h"
	#include "memory.h"
	#include "suffix.h"

	using namespace LAMMPS_NS;
	using namespace MathConst;
	using namespace MathSpecial;

	#define MAXORDER 7
	#define OFFSET 16384
	#define SMALL 0.00001
	#define LARGE 10000.0
	#define EPS_HOC 1.0e-7

	enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER};
	enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE};
	enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM,
	FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G,
	FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A,
	FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE,
	FORWARD_AD_PERATOM_NONE};

	#ifdef FFT_SINGLE
	#define ZEROF 0.0f
	#define ONEF 1.0f
	#else
	#define ZEROF 0.0
	#define ONEF 1.0
	#endif

	/* ---------------------------------------------------------------------- */

	PPPMDispIntel::PPPMDispIntel(LAMMPS lmp, int narg, char *arg) :
	PPPMDisp(lmp, narg, arg)
	{
	suffix_flag \|= Suffix::INTEL;

	order = 7;
	order_6 = 7; //sets default stencil sizes to 7

	perthread_density = NULL;
	particle_ekx = particle_eky = particle_ekz = NULL;
	particle_ekx0 = particle_eky0 = particle_ekz0 = NULL;
	particle_ekx1 = particle_eky1 = particle_ekz1 = NULL;
	particle_ekx2 = particle_eky2 = particle_ekz2 = NULL;
	particle_ekx3 = particle_eky3 = particle_ekz3 = NULL;
	particle_ekx4 = particle_eky4 = particle_ekz4 = NULL;
	particle_ekx5 = particle_eky5 = particle_ekz5 = NULL;
	particle_ekx6 = particle_eky6 = particle_ekz6 = NULL;

	rho_lookup = drho_lookup = NULL;
	rho6_lookup = drho6_lookup = NULL;
	rho_points = 0;

	_use_table = _use_packing = _use_lrt = 0;
	}

	PPPMDispIntel::~PPPMDispIntel()
	{
	memory->destroy(perthread_density);
	memory->destroy(particle_ekx);
	memory->destroy(particle_eky);
	memory->destroy(particle_ekz);

	memory->destroy(rho_lookup);
	memory->destroy(drho_lookup);
	memory->destroy(rho6_lookup);
	memory->destroy(drho6_lookup);
	}



	/* ----------------------------------------------------------------------
	called once before run
	------------------------------------------------------------------------- */


	void PPPMDispIntel::init()
	{

	PPPMDisp::init();
	int ifix = modify->find_fix("package_intel");
	if (ifix < 0)
	error->all(FLERR,
	"The 'package intel' command is required for /intel styles");
	fix = static_cast<FixIntel *>(modify->fix[ifix]);

	#ifdef _LMP_INTEL_OFFLOAD
	_use_base = 0;
	if (fix->offload_balance() != 0.0) {
	_use_base = 1;
	return;
	}
	#endif

	fix->kspace_init_check();

	_use_lrt = fix->lrt();
	if (_use_lrt)
	error->all(FLERR,
	"LRT mode is currently not supported for pppm/disp/intel");


	// For vectorization, we need some padding in the end
	// The first thread computes on the global density
	if ((comm->nthreads > 1) && !_use_lrt) {
	memory->destroy(perthread_density);
	memory->create(perthread_density, comm->nthreads-1,
	ngrid + INTEL_P3M_ALIGNED_MAXORDER,
	"pppmdispintel:perthread_density");
	}

	_use_table = fix->pppm_table();
	if (_use_table) {
	rho_points = 5000;
	memory->destroy(rho_lookup);
	memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
	"pppmdispintel:rho_lookup");
	memory->destroy(rho6_lookup);
	memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
	"pppmdispintel:rho6_lookup");

	if(differentiation_flag == 1) {
	memory->destroy(drho_lookup);
	memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
	"pppmdispintel:drho_lookup");
	memory->destroy(drho6_lookup);
	memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
	"pppmdispintel:drho6_lookup");
	}
	precompute_rho();
	}
	if (order > INTEL_P3M_MAXORDER)
	error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
	}

	/* ----------------------------------------------------------------------
	compute the PPPMDispIntel long-range force, energy, virial
	------------------------------------------------------------------------- */

	void PPPMDispIntel::compute(int eflag, int vflag)
	{
	#ifdef _LMP_INTEL_OFFLOAD
	if (_use_base) {
	PPPMDisp::compute(eflag, vflag);
	return;
	}
	#endif
	int i;
	// convert atoms from box to lamda coords

	if (eflag \|\| vflag) ev_setup(eflag,vflag);
	else evflag = evflag_atom = eflag_global = vflag_global =
	eflag_atom = vflag_atom = 0;

	if (evflag_atom && !peratom_allocate_flag) {
	allocate_peratom();
	if (function[0]) {
	cg_peratom->ghost_notify();
	cg_peratom->setup();
	}
	if (function[1] + function[2] + function[3]) {
	cg_peratom_6->ghost_notify();
	cg_peratom_6->setup();
	}
	peratom_allocate_flag = 1;
	}
	if (triclinic == 0) boxlo = domain->boxlo;
	else {
	boxlo = domain->boxlo_lamda;
	domain->x2lamda(atom->nlocal);
	}
	// extend size of per-atom arrays if necessary

	if (atom->nmax > nmax) {

	if (function[0]) memory->destroy(part2grid);
	if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6);
	if (differentiation_flag == 1) {
	memory->destroy(particle_ekx);
	memory->destroy(particle_eky);
	memory->destroy(particle_ekz);
	if (function[2] == 1){
	memory->destroy(particle_ekx0);
	memory->destroy(particle_eky0);
	memory->destroy(particle_ekz0);
	memory->destroy(particle_ekx1);
	memory->destroy(particle_eky1);
	memory->destroy(particle_ekz1);
	memory->destroy(particle_ekx2);
	memory->destroy(particle_eky2);
	memory->destroy(particle_ekz2);
	memory->destroy(particle_ekx3);
	memory->destroy(particle_eky3);
	memory->destroy(particle_ekz3);
	memory->destroy(particle_ekx4);
	memory->destroy(particle_eky4);
	memory->destroy(particle_ekz4);
	memory->destroy(particle_ekx5);
	memory->destroy(particle_eky5);
	memory->destroy(particle_ekz5);
	memory->destroy(particle_ekx6);
	memory->destroy(particle_eky6);
	memory->destroy(particle_ekz6);
	}

	}
	nmax = atom->nmax;
	if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid");
	if (function[1] + function[2] + function[3])
	memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6");
	if (differentiation_flag == 1) {
	memory->create(particle_ekx, nmax, "pppmdispintel:pekx");
	memory->create(particle_eky, nmax, "pppmdispintel:peky");
	memory->create(particle_ekz, nmax, "pppmdispintel:pekz");
	if (function[2] == 1){
	memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0");
	memory->create(particle_eky0, nmax, "pppmdispintel:peky0");
	memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0");
	memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1");
	memory->create(particle_eky1, nmax, "pppmdispintel:peky1");
	memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1");
	memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2");
	memory->create(particle_eky2, nmax, "pppmdispintel:peky2");
	memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2");
	memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3");
	memory->create(particle_eky3, nmax, "pppmdispintel:peky3");
	memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3");
	memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4");
	memory->create(particle_eky4, nmax, "pppmdispintel:peky4");
	memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4");
	memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5");
	memory->create(particle_eky5, nmax, "pppmdispintel:peky5");
	memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5");
	memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6");
	memory->create(particle_eky6, nmax, "pppmdispintel:peky6");
	memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6");
	}
	}
	}
	energy = 0.0;
	energy_1 = 0.0;
	energy_6 = 0.0;
	if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0;

	// find grid points for all my particles
	// distribute partcles' charges/dispersion coefficients on the grid
	// communication between processors and remapping two fft
	// Solution of poissons equation in k-space and backtransformation
	// communication between processors
	// calculation of forces

	if (function[0]) {

	//perform calculations for coulomb interactions only

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid,
	nupper, nlower, nxlo_out, nylo_out, nzlo_out,
	nxhi_out, nyhi_out, nzhi_out,
	fix->get_mixed_buffers());
	make_rho_c<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid,
	nupper, nlower, nxlo_out, nylo_out,
	nzlo_out, nxhi_out, nyhi_out, nzhi_out,
	fix->get_double_buffers());
	make_rho_c<double,double>(fix->get_double_buffers());
	} else {
	particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid,
	nupper, nlower, nxlo_out, nylo_out, nzlo_out,
	nxhi_out, nyhi_out, nzhi_out,
	fix->get_single_buffers());
	make_rho_c<float,float>(fix->get_single_buffers());
	}

	cg->reverse_comm(this,REVERSE_RHO);

	brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
	density_brick, density_fft, work1,remap);

	if (differentiation_flag == 1) {
	poisson_ad(work1, work2, density_fft, fft1, fft2,
	nx_pppm, ny_pppm, nz_pppm, nfft,
	nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
	nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
	energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick,
	v1_brick, v2_brick, v3_brick, v4_brick, v5_brick);

	cg->forward_comm(this,FORWARD_AD);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_c_ad<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_c_ad<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_c_ad<float,float>(fix->get_single_buffers());
	}

	if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM);

	} else {
	poisson_ik(work1, work2, density_fft, fft1, fft2,
	nx_pppm, ny_pppm, nz_pppm, nfft,
	nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
	nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
	energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2,
	vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2,
	u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick,
	v5_brick);

	cg->forward_comm(this, FORWARD_IK);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_c_ik<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_c_ik<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_c_ik<float,float>(fix->get_single_buffers());
	}

	if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM);
	}
	if (evflag_atom) fieldforce_c_peratom();
	}

	if (function[1]) {
	//perfrom calculations for geometric mixing

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_mixed_buffers());
	make_rho_g<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_double_buffers());
	make_rho_g<double,double>(fix->get_double_buffers());
	} else {
	particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_single_buffers());
	make_rho_g<float,float>(fix->get_single_buffers());
	}


	cg_6->reverse_comm(this, REVERSE_RHO_G);

	brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6,
	density_brick_g, density_fft_g, work1_6,remap_6);

	if (differentiation_flag == 1) {

	poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
	nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6,
	nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6,
	nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6,
	nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6,
	virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g,
	v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);

	cg_6->forward_comm(this,FORWARD_AD_G);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_g_ad<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_g_ad<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_g_ad<float,float>(fix->get_single_buffers());
	}

	if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G);

	} else {
	poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
	nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
	nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
	nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
	nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,
	fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g,
	vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g,
	v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);

	cg_6->forward_comm(this,FORWARD_IK_G);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_g_ik<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_g_ik<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_g_ik<float,float>(fix->get_single_buffers());
	}


	if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G);
	}
	if (evflag_atom) fieldforce_g_peratom();
	}

	if (function[2]) {
	//perform calculations for arithmetic mixing

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6,
	nxlo_out_6, nylo_out_6, nzlo_out_6,
	nxhi_out_6, nyhi_out_6, nzhi_out_6,
	fix->get_mixed_buffers());
	make_rho_a<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_double_buffers());
	make_rho_a<double,double>(fix->get_double_buffers());
	} else {
	particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_single_buffers());
	make_rho_a<float,float>(fix->get_single_buffers());
	}

	cg_6->reverse_comm(this, REVERSE_RHO_A);

	brick2fft_a();

	if ( differentiation_flag == 1) {

	poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
	nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
	nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
	nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
	nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6,
	u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3,
	v3_brick_a3, v4_brick_a3, v5_brick_a3);
	poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0,
	v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0,
	v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6,
	v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6);
	poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1,
	v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1,
	v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5,
	v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5);
	poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2,
	v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2,
	v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4,
	v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4);

	cg_6->forward_comm(this, FORWARD_AD_A);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_a_ad<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_a_ad<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_a_ad<float,float>(fix->get_single_buffers());
	}

	if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A);

	} else {

	poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
	nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
	nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
	nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
	nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6,
	fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3,
	virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3,
	v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3);
	poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0,
	vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6,
	vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0,
	v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0,
	u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6,
	v3_brick_a6, v4_brick_a6, v5_brick_a6);
	poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1,
	vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5,
	vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1,
	v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1,
	u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5,
	v3_brick_a5, v4_brick_a5, v5_brick_a5);
	poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2,
	vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4,
	vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2,
	v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2,
	u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4,
	v3_brick_a4, v4_brick_a4, v5_brick_a4);

	cg_6->forward_comm(this, FORWARD_IK_A);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_a_ik<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_a_ik<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_a_ik<float,float>(fix->get_single_buffers());
	}

	if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A);
	}
	if (evflag_atom) fieldforce_a_peratom();
	}

	if (function[3]) {
	//perform calculations if no mixing rule applies

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_mixed_buffers());
	make_rho_none<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_double_buffers());
	make_rho_none<double,double>(fix->get_double_buffers());
	} else {
	particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
	part2grid_6, nupper_6, nlower_6, nxlo_out_6,
	nylo_out_6, nzlo_out_6, nxhi_out_6,
	nyhi_out_6, nzhi_out_6,
	fix->get_single_buffers());
	make_rho_none<float,float>(fix->get_single_buffers());
	}

	cg_6->reverse_comm(this, REVERSE_RHO_NONE);

	brick2fft_none();

	if (differentiation_flag == 1) {

	int n = 0;
	for (int k = 0; k<nsplit_alloc/2; k++) {
	poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1],
	u_brick_none[n],u_brick_none[n+1],
	v0_brick_none, v1_brick_none, v2_brick_none,
	v3_brick_none, v4_brick_none, v5_brick_none);
	n += 2;
	}

	cg_6->forward_comm(this,FORWARD_AD_NONE);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_none_ad<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_none_ad<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_none_ad<float,float>(fix->get_single_buffers());
	}

	if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE);

	} else {
	int n = 0;
	for (int k = 0; k<nsplit_alloc/2; k++) {

	poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1],
	vdx_brick_none[n], vdy_brick_none[n],
	vdz_brick_none[n], vdx_brick_none[n+1],
	vdy_brick_none[n+1], vdz_brick_none[n+1],
	u_brick_none, v0_brick_none, v1_brick_none,
	v2_brick_none, v3_brick_none, v4_brick_none,
	v5_brick_none);
	n += 2;
	}

	cg_6->forward_comm(this,FORWARD_IK_NONE);

	if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
	fieldforce_none_ik<float,double>(fix->get_mixed_buffers());
	} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
	fieldforce_none_ik<double,double>(fix->get_double_buffers());
	} else {
	fieldforce_none_ik<float,float>(fix->get_single_buffers());
	}

	if (evflag_atom)
	cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE);
	}
	if (evflag_atom) fieldforce_none_peratom();
	}

	// update qsum and qsqsum, if atom count has changed and energy needed

	if ((eflag_global \|\| eflag_atom) && atom->natoms != natoms_original) {
	qsum_qsq();
	natoms_original = atom->natoms;
	}

	// sum energy across procs and add in volume-dependent term

	const double qscale = force->qqrd2e * scale;
	if (eflag_global) {
	double energy_all;
	MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
	energy_1 = energy_all;
	MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
	energy_6 = energy_all;

	energy_1 = 0.5volume;
	energy_6 = 0.5volume;

	energy_1 -= g_ewald*qsqsum/MY_PIS +
	MY_PI2qsumqsum / (g_ewaldg_ewaldvolume);
	energy_6 += - MY_PIMY_PIS/(6volume)pow(g_ewald_6,3)csumij +
	1.0/12.0pow(g_ewald_6,6)csum;
	energy_1 *= qscale;
	}

	// sum virial across procs

	if (vflag_global) {
	double virial_all[6];
	MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
	for (i = 0; i < 6; i++) virial[i] = 0.5qscalevolume*virial_all[i];
	MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
	for (i = 0; i < 6; i++) virial[i] += 0.5volumevirial_all[i];
	if (function[1]+function[2]+function[3]){
	double a = MY_PIMY_PIS/(6volume)pow(g_ewald_6,3)csumij;
	virial[0] -= a;
	virial[1] -= a;
	virial[2] -= a;
	}
	}

	if (eflag_atom) {
	if (function[0]) {
	double *q = atom->q;
	for (i = 0; i < atom->nlocal; i++) {
	eatom[i] -= qscaleg_ewaldq[i]q[i]/MY_PIS + qscaleMY_PI2q[i]
	qsum / (g_ewaldg_ewaldvolume); //coulomb self energy correction
	}
	}
	if (function[1] + function[2] + function[3]) {
	int tmp;
	for (i = 0; i < atom->nlocal; i++) {
	tmp = atom->type[i];
	eatom[i] += - MY_PIMY_PIS/(6volume)pow(g_ewald_6,3)csumi[tmp] +
	1.0/12.0pow(g_ewald_6,6)cii[tmp];
	}
	}
	}

	if (vflag_atom) {
	if (function[1] + function[2] + function[3]) {
	int tmp;
	for (i = 0; i < atom->nlocal; i++) {
	tmp = atom->type[i];
	//dispersion self virial correction
	for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PIMY_PIS/(6volume)*
	pow(g_ewald_6,3)*csumi[tmp];
	}
	}
	}


	// 2d slab correction

	if (slabflag) slabcorr(eflag);
	if (function[0]) energy += energy_1;
	if (function[1] + function[2] + function[3]) energy += energy_6;

	// convert atoms back from lamda to box coords

	if (triclinic) domain->lamda2x(atom->nlocal);
	}


	/* ---------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	find center grid pt for each of my particles
	check that full stencil for the particle will fit in my 3d brick
	store central grid pt indices in part2grid array
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t>
	void PPPMDispIntel::particle_map(double delx, double dely, double delz,
	double sft, int** p2g, int nup, int nlow,
	int nxlo, int nylo, int nzlo,
	int nxhi, int nyhi, int nzhi,
	IntelBuffers<flt_t,acc_t> *buffers)
	{
	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	if (!ISFINITE(boxlo[0]) \|\| !ISFINITE(boxlo[1]) \|\| !ISFINITE(boxlo[2]))
	error->one(FLERR,"Non-numeric box dimensions - simulation unstable");

	int flag = 0;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\
	nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt)
	#endif
	{
	double **x = atom->x;
	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delx;
	const flt_t yi = dely;
	const flt_t zi = delz;
	const flt_t fshift = sft;


	int iifrom, iito, tid;
	IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));

	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma simd reduction(+:flag)
	#endif
	for (int i = iifrom; i < iito; i++) {

	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// current particle coord can be outside global and local box
	// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

	int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET;
	int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET;
	int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET;

	p2g[i][0] = nx;
	p2g[i][1] = ny;
	p2g[i][2] = nz;

	// check that entire stencil around nx,ny,nz will fit in my 3d brick

	if (nx+nlow < nxlo \|\| nx+nup > nxhi \|\|
	ny+nlow < nylo \|\| ny+nup > nyhi \|\|
	nz+nlow < nzlo \|\| nz+nup > nzhi)
	flag = 1;
	}
	}

	if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp");
	}

	/* ----------------------------------------------------------------------
	create discretized "density" on section of global grid due to my particles
	density(x,y,z) = charge "density" at grid points of my 3d brick
	(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
	in global grid
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
	{
	// clear 3d density array

	FFT_SCALAR * _noalias global_density =
	&(density_brick[nzlo_out][nylo_out][nxlo_out]);

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	//double *q = atom->q;
	//double **x = atom->x;
	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nthr, nlocal, global_density) if(!_use_lrt)
	#endif
	{
	double *q = atom->q;
	double **x = atom->x;

	const int nix = nxhi_out - nxlo_out + 1;
	const int niy = nyhi_out - nylo_out + 1;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv;
	const flt_t yi = delyinv;
	const flt_t zi = delzinv;
	const flt_t fshift = shift;
	const flt_t fshiftone = shiftone;
	const flt_t fdelvolinv = delvolinv;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
	FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
	perthread_density[tid - 1];
	// clear 3d density array
	memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));

	for (int i = ifrom; i < ito; i++) {

	int nx = part2grid[i][0];
	int ny = part2grid[i][1];
	int nz = part2grid[i][2];

	int nysum = nlower + ny - nylo_out;
	int nxsum = nlower + nx - nxlo_out;
	int nzsum = (nlower + nz - nzlo_out)nixniy + nysum*nix + nxsum;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho_lookup[idx][k];
	rho[1][k] = rho_lookup[idy][k];
	rho[2][k] = rho_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower; k <= nupper; k++) {
	FFT_SCALAR r1,r2,r3;
	r1 = r2 = r3 = ZEROF;

	for (int l = order-1; l >= 0; l--) {
	r1 = rho_coeff[l][k] + r1*dx;
	r2 = rho_coeff[l][k] + r2*dy;
	r3 = rho_coeff[l][k] + r3*dz;
	}
	rho[0][k-nlower] = r1;
	rho[1][k-nlower] = r2;
	rho[2][k-nlower] = r3;
	}
	}

	FFT_SCALAR z0 = fdelvolinv * q[i];

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order; n++) {
	int mz = nnixniy + nzsum;
	FFT_SCALAR y0 = z0*rho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order; m++) {
	int mzy = m*nix + mz;
	FFT_SCALAR x0 = y0*rho[1][m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mzyx = l + mzy;
	my_density[mzyx] += x0*rho[0][l];
	}
	}
	}
	}
	}

	// reduce all the perthread_densities into global_density
	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nthr, global_density) if(!_use_lrt)
	#endif
	{
	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int i = ifrom; i < ito; i++) {
	for(int j = 1; j < nthr; j++) {
	global_density[i] += perthread_density[j-1][i];
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	create discretized "density" on section of global grid due to my particles
	density(x,y,z) = dispersion "density" at grid points of my 3d brick
	(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
	in global grid --- geometric mixing
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
	{
	// clear 3d density array

	FFT_SCALAR * _noalias global_density =
	&(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]);

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nthr, nlocal, global_density) if(!_use_lrt)
	#endif
	{
	int type;
	double **x = atom->x;

	const int nix = nxhi_out_6 - nxlo_out_6 + 1;
	const int niy = nyhi_out_6 - nylo_out_6 + 1;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshift = shift_6;
	const flt_t fshiftone = shiftone_6;
	const flt_t fdelvolinv = delvolinv_6;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
	FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
	perthread_density[tid - 1];

	// clear 3d density array
	memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));

	for (int i = ifrom; i < ito; i++) {

	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];

	int nysum = nlower_6 + ny - nylo_out_6;
	int nxsum = nlower_6 + nx - nxlo_out_6;
	int nzsum = (nlower_6 + nz - nzlo_out_6)nixniy + nysum*nix + nxsum;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho6_lookup[idx][k];
	rho[1][k] = rho6_lookup[idy][k];
	rho[2][k] = rho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1,r2,r3;
	r1 = r2 = r3 = ZEROF;

	for (int l = order_6-1; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1*dx;
	r2 = rho_coeff_6[l][k] + r2*dy;
	r3 = rho_coeff_6[l][k] + r3*dz;
	}
	rho[0][k-nlower_6] = r1;
	rho[1][k-nlower_6] = r2;
	rho[2][k-nlower_6] = r3;
	}
	}

	type = atom->type[i];
	FFT_SCALAR z0 = fdelvolinv * B[type];

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = nnixniy + nzsum;
	FFT_SCALAR y0 = z0*rho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int mzy = m*nix + mz;
	FFT_SCALAR x0 = y0*rho[1][m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mzyx = l + mzy;
	my_density[mzyx] += x0*rho[0][l];
	}
	}
	}
	}
	}

	// reduce all the perthread_densities into global_density
	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nthr, global_density) if(!_use_lrt)
	#endif
	{
	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int i = ifrom; i < ito; i++) {
	for(int j = 1; j < nthr; j++) {
	global_density[i] += perthread_density[j-1][i];
	}
	}
	}

	}

	/* ----------------------------------------------------------------------
	create discretized "density" on section of global grid due to my particles
	density(x,y,z) = dispersion "density" at grid points of my 3d brick
	(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
	in global grid --- arithmetic mixing
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
	{
	// clear 3d density array

	memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));
	memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));
	memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));
	memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));
	memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));
	memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));
	memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
	ngrid_6*sizeof(FFT_SCALAR));

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	int nlocal = atom->nlocal;

	double **x = atom->x;

	const int nix = nxhi_out_6 - nxlo_out_6 + 1;
	const int niy = nyhi_out_6 - nylo_out_6 + 1;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshift = shift_6;
	const flt_t fshiftone = shiftone_6;
	const flt_t fdelvolinv = delvolinv_6;

	for (int i = 0; i < nlocal; i++) {

	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho6_lookup[idx][k];
	rho[1][k] = rho6_lookup[idy][k];
	rho[2][k] = rho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1,r2,r3;
	r1 = r2 = r3 = ZEROF;

	for (int l = order_6-1; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1*dx;
	r2 = rho_coeff_6[l][k] + r2*dy;
	r3 = rho_coeff_6[l][k] + r3*dz;
	}
	rho[0][k-nlower_6] = r1;
	rho[1][k-nlower_6] = r2;
	rho[2][k-nlower_6] = r3;
	}
	}

	const int type = atom->type[i];
	FFT_SCALAR z0 = fdelvolinv;

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n + nzsum;
	FFT_SCALAR y0 = z0*rho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m + nysum;
	FFT_SCALAR x0 = y0*rho[1][m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l + nxsum;
	FFT_SCALAR w = x0*rho[0][l];
	density_brick_a0[mz][my][mx] += wB[7type];
	density_brick_a1[mz][my][mx] += wB[7type+1];
	density_brick_a2[mz][my][mx] += wB[7type+2];
	density_brick_a3[mz][my][mx] += wB[7type+3];
	density_brick_a4[mz][my][mx] += wB[7type+4];
	density_brick_a5[mz][my][mx] += wB[7type+5];
	density_brick_a6[mz][my][mx] += wB[7type+6];
	}
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	create discretized "density" on section of global grid due to my particles
	density(x,y,z) = dispersion "density" at grid points of my 3d brick
	(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
	in global grid --- case when mixing rules don't apply
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
	{

	FFT_SCALAR * _noalias global_density = &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]);

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nthr, nlocal, global_density) if(!_use_lrt)
	#endif
	{
	int type;
	double **x = atom->x;

	const int nix = nxhi_out_6 - nxlo_out_6 + 1;
	const int niy = nyhi_out_6 - nylo_out_6 + 1;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshift = shift_6;
	const flt_t fshiftone = shiftone_6;
	const flt_t fdelvolinv = delvolinv_6;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
	FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
	perthread_density[tid - 1];
	// clear 3d density array
	memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));

	for (int i = ifrom; i < ito; i++) {

	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];

	int nysum = nlower_6 + ny - nylo_out_6;
	int nxsum = nlower_6 + nx - nxlo_out_6;
	int nzsum = (nlower_6 + nz - nzlo_out_6)nixniy + nysum*nix + nxsum;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho6_lookup[idx][k];
	rho[1][k] = rho6_lookup[idy][k];
	rho[2][k] = rho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1,r2,r3;
	r1 = r2 = r3 = ZEROF;

	for (int l = order_6-1; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1*dx;
	r2 = rho_coeff_6[l][k] + r2*dy;
	r3 = rho_coeff_6[l][k] + r3*dz;
	}
	rho[0][k-nlower_6] = r1;
	rho[1][k-nlower_6] = r2;
	rho[2][k-nlower_6] = r3;
	}
	}

	type = atom->type[i];
	FFT_SCALAR z0 = fdelvolinv;

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = nnixniy + nzsum;
	FFT_SCALAR y0 = z0*rho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int mzy = m*nix + mz;
	FFT_SCALAR x0 = y0*rho[1][m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mzyx = l + mzy;
	FFT_SCALAR w0 = x0*rho[0][l];
	for(int k = 0; k < nsplit; k++)
	my_density[mzyx + kngrid_6] += x0rho[0][l];
	}
	}
	}
	}
	}

	// reduce all the perthread_densities into global_density
	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nthr, global_density) if(!_use_lrt)
	#endif
	{
	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int i = ifrom; i < ito; i++) {
	for(int j = 1; j < nthr; j++) {
	global_density[i] += perthread_density[j-1][i];
	}
	}
	}

	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get electric field & force on my particles
	for ik scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	//double *q = atom->q;
	//double **x = atom->x;
	//double **f = atom->f;

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;


	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double *q = atom->q;
	double **x = atom->x;
	double **f = atom->f;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv;
	const flt_t yi = delyinv;
	const flt_t zi = delzinv;
	const flt_t fshiftone = shiftone;
	const flt_t fqqrd2es = qqrd2e * scale;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
	_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid[i][0];
	int ny = part2grid[i][1];
	int nz = part2grid[i][2];

	int nxsum = nx + nlower;
	int nysum = ny + nlower;
	int nzsum = nz + nlower;;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho0[k] = rho_lookup[idx][k];
	rho1[k] = rho_lookup[idy][k];
	rho2[k] = rho_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower; k <= nupper; k++) {
	FFT_SCALAR r1 = rho_coeff[order-1][k];
	FFT_SCALAR r2 = rho_coeff[order-1][k];
	FFT_SCALAR r3 = rho_coeff[order-1][k];
	for (int l = order-2; l >= 0; l--) {
	r1 = rho_coeff[l][k] + r1*dx;
	r2 = rho_coeff[l][k] + r2*dy;
	r3 = rho_coeff[l][k] + r3*dz;
	}

	rho0[k-nlower] = r1;
	rho1[k-nlower] = r2;
	rho2[k-nlower] = r3;
	}
	}

	_alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order; n++) {
	int mz = n+nzsum;
	FFT_SCALAR z0 = rho2[n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order; m++) {
	int my = m+nysum;
	FFT_SCALAR y0 = z0*rho1[m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l+nxsum;
	FFT_SCALAR x0 = y0*rho0[l];
	ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
	eky_arr[l] -= x0*vdy_brick[mz][my][mx];
	ekz_arr[l] -= x0*vdz_brick[mz][my][mx];

	}
	}
	}

	FFT_SCALAR ekx, eky, ekz;
	ekx = eky = ekz = ZEROF;


	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	ekx += ekx_arr[l];
	eky += eky_arr[l];
	ekz += ekz_arr[l];
	}

	// convert E-field to force

	const flt_t qfactor = fqqrd2es * q[i];
	f[i][0] += qfactor*ekx;
	f[i][1] += qfactor*eky;
	if (slabflag != 2) f[i][2] += qfactor*ekz;
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get electric field & force on my particles
	for ad scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	//double *q = atom->q;
	//double **x = atom->x;
	//double **f = atom->f;

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
	FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
	FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double *prd;
	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double *q = atom->q;
	double **x = atom->x;
	double **f = atom->f;
	const flt_t ftwo_pi = MY_PI * 2.0;
	const flt_t ffour_pi = MY_PI * 4.0;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv;
	const flt_t yi = delyinv;
	const flt_t zi = delzinv;
	const flt_t fshiftone = shiftone;
	const flt_t fqqrd2es = qqrd2e * scale;

	const double xprd = prd[0];
	const double yprd = prd[1];
	const double zprd = prd[2]*slab_volfactor;

	const flt_t hx_inv = nx_pppm/xprd;
	const flt_t hy_inv = ny_pppm/yprd;
	const flt_t hz_inv = nz_pppm/zprd;

	const flt_t fsf_coeff0 = sf_coeff[0];
	const flt_t fsf_coeff1 = sf_coeff[1];
	const flt_t fsf_coeff2 = sf_coeff[2];
	const flt_t fsf_coeff3 = sf_coeff[3];
	const flt_t fsf_coeff4 = sf_coeff[4];
	const flt_t fsf_coeff5 = sf_coeff[5];

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid[i][0];
	int ny = part2grid[i][1];
	int nz = part2grid[i][2];
	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	int nxsum = nx + nlower;
	int nysum = ny + nlower;
	int nzsum = nz + nlower;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho_lookup[idx][k];
	rho[1][k] = rho_lookup[idy][k];
	rho[2][k] = rho_lookup[idz][k];
	drho[0][k] = drho_lookup[idx][k];
	drho[1][k] = drho_lookup[idy][k];
	drho[2][k] = drho_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower; k <= nupper; k++) {
	FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
	dr1 = dr2 = dr3 = ZEROF;

	r1 = rho_coeff[order-1][k];
	r2 = rho_coeff[order-1][k];
	r3 = rho_coeff[order-1][k];
	for (int l = order-2; l >= 0; l--) {
	r1 = rho_coeff[l][k] + r1 * dx;
	r2 = rho_coeff[l][k] + r2 * dy;
	r3 = rho_coeff[l][k] + r3 * dz;
	dr1 = drho_coeff[l][k] + dr1 * dx;
	dr2 = drho_coeff[l][k] + dr2 * dy;
	dr3 = drho_coeff[l][k] + dr3 * dz;
	}
	rho[0][k-nlower] = r1;
	rho[1][k-nlower] = r2;
	rho[2][k-nlower] = r3;
	drho[0][k-nlower] = dr1;
	drho[1][k-nlower] = dr2;
	drho[2][k-nlower] = dr3;
	}
	}
	_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order; n++) {
	int mz = n + nzsum;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order; m++) {
	int my = m + nysum;
	FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
	FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
	FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l + nxsum;
	ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
	eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
	ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx];
	}
	}
	}

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
	particle_ekx[i] += ekx[l];
	particle_eky[i] += eky[l];
	particle_ekz[i] += ekz[l];
	}
	}
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int i = ifrom; i < ito; i++) {
	particle_ekx[i] *= hx_inv;
	particle_eky[i] *= hy_inv;
	particle_ekz[i] *= hz_inv;

	// convert E-field to force

	const flt_t qfactor = fqqrd2es * q[i];
	const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];

	const flt_t s1 = x[i][0] * hx_inv;
	const flt_t s2 = x[i][1] * hy_inv;
	const flt_t s3 = x[i][2] * hz_inv;
	flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
	sf += fsf_coeff1 * sin(ffour_pi * s1);
	sf *= twoqsq;
	f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf;

	sf = fsf_coeff2 * sin(ftwo_pi * s2);
	sf += fsf_coeff3 * sin(ffour_pi * s2);
	sf *= twoqsq;
	f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf;

	sf = fsf_coeff4 * sin(ftwo_pi * s3);
	sf += fsf_coeff5 * sin(ffour_pi * s3);
	sf *= twoqsq;

	if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf;
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for geometric mixing rule
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of dispersion field on particle

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;


	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double lj;
	int type;
	double **x = atom->x;
	double **f = atom->f;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshiftone = shiftone_6;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
	_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho0[k] = rho6_lookup[idx][k];
	rho1[k] = rho6_lookup[idy][k];
	rho2[k] = rho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
	FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
	FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
	for (int l = order_6-2; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1*dx;
	r2 = rho_coeff_6[l][k] + r2*dy;
	r3 = rho_coeff_6[l][k] + r3*dz;
	}

	rho0[k-nlower_6] = r1;
	rho1[k-nlower_6] = r2;
	rho2[k-nlower_6] = r3;
	}
	}

	_alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n+nzsum;
	FFT_SCALAR z0 = rho2[n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m+nysum;
	FFT_SCALAR y0 = z0*rho1[m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l+nxsum;
	FFT_SCALAR x0 = y0*rho0[l];
	ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
	eky_arr[l] -= x0*vdy_brick_g[mz][my][mx];
	ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx];

	}
	}
	}

	FFT_SCALAR ekx, eky, ekz;
	ekx = eky = ekz = ZEROF;


	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	ekx += ekx_arr[l];
	eky += eky_arr[l];
	ekz += ekz_arr[l];
	}

	// convert E-field to force

	type = atom->type[i];
	lj = B[type];
	f[i][0] += lj*ekx;
	f[i][1] += lj*eky;
	if (slabflag != 2) f[i][2] += lj*ekz;
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for geometric mixing rule for ad scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of dispersion field on particle

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
	FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
	FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double *prd;
	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double **x = atom->x;
	double **f = atom->f;
	const flt_t ftwo_pi = MY_PI * 2.0;
	const flt_t ffour_pi = MY_PI * 4.0;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshiftone = shiftone_6;

	const double xprd = prd[0];
	const double yprd = prd[1];
	const double zprd = prd[2]*slab_volfactor;

	const flt_t hx_inv = nx_pppm_6/xprd;
	const flt_t hy_inv = ny_pppm_6/yprd;
	const flt_t hz_inv = nz_pppm_6/zprd;

	const flt_t fsf_coeff0 = sf_coeff_6[0];
	const flt_t fsf_coeff1 = sf_coeff_6[1];
	const flt_t fsf_coeff2 = sf_coeff_6[2];
	const flt_t fsf_coeff3 = sf_coeff_6[3];
	const flt_t fsf_coeff4 = sf_coeff_6[4];
	const flt_t fsf_coeff5 = sf_coeff_6[5];

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];
	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho6_lookup[idx][k];
	rho[1][k] = rho6_lookup[idy][k];
	rho[2][k] = rho6_lookup[idz][k];
	drho[0][k] = drho6_lookup[idx][k];
	drho[1][k] = drho6_lookup[idy][k];
	drho[2][k] = drho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
	dr1 = dr2 = dr3 = ZEROF;

	r1 = rho_coeff_6[order_6-1][k];
	r2 = rho_coeff_6[order_6-1][k];
	r3 = rho_coeff_6[order_6-1][k];
	for (int l = order_6-2; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1 * dx;
	r2 = rho_coeff_6[l][k] + r2 * dy;
	r3 = rho_coeff_6[l][k] + r3 * dz;
	dr1 = drho_coeff_6[l][k] + dr1 * dx;
	dr2 = drho_coeff_6[l][k] + dr2 * dy;
	dr3 = drho_coeff_6[l][k] + dr3 * dz;
	}
	rho[0][k-nlower_6] = r1;
	rho[1][k-nlower_6] = r2;
	rho[2][k-nlower_6] = r3;
	drho[0][k-nlower_6] = dr1;
	drho[1][k-nlower_6] = dr2;
	drho[2][k-nlower_6] = dr3;
	}
	}
	_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n + nzsum;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m + nysum;
	FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
	FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
	FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l + nxsum;
	ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
	eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx];
	ekz[l] += rho[0][l] * ekz_p * u_brick_g[mz][my][mx];
	}
	}
	}

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
	particle_ekx[i] += ekx[l];
	particle_eky[i] += eky[l];
	particle_ekz[i] += ekz[l];
	}
	}
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int i = ifrom; i < ito; i++) {
	particle_ekx[i] *= hx_inv;
	particle_eky[i] *= hy_inv;
	particle_ekz[i] *= hz_inv;

	// convert E-field to force

	const int type = atom->type[i];
	const flt_t lj = B[type];
	const flt_t twoljsq = 2.ljlj;

	const flt_t s1 = x[i][0] * hx_inv;
	const flt_t s2 = x[i][1] * hy_inv;
	const flt_t s3 = x[i][2] * hz_inv;
	flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
	sf += fsf_coeff1 * sin(ffour_pi * s1);
	sf *= twoljsq;
	f[i][0] += lj * particle_ekx[i] - sf;

	sf = fsf_coeff2 * sin(ftwo_pi * s2);
	sf += fsf_coeff3 * sin(ffour_pi * s2);
	sf *= twoljsq;
	f[i][1] += lj * particle_eky[i] - sf;

	sf = fsf_coeff4 * sin(ftwo_pi * s3);
	sf += fsf_coeff5 * sin(ffour_pi * s3);
	sf *= twoljsq;

	if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf;
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for arithmetic mixing rule and ik scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of dispersion field on particle

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;


	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{
	double **x = atom->x;
	double **f = atom->f;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshiftone = shiftone_6;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
	_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho0[k] = rho6_lookup[idx][k];
	rho1[k] = rho6_lookup[idy][k];
	rho2[k] = rho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
	FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
	FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
	for (int l = order_6-2; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1*dx;
	r2 = rho_coeff_6[l][k] + r2*dy;
	r3 = rho_coeff_6[l][k] + r3*dz;
	}

	rho0[k-nlower_6] = r1;
	rho1[k-nlower_6] = r2;
	rho2[k-nlower_6] = r3;
	}
	}

	_alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};


	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n+nzsum;
	FFT_SCALAR z0 = rho2[n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m+nysum;
	FFT_SCALAR y0 = z0*rho1[m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l+nxsum;
	FFT_SCALAR x0 = y0*rho0[l];
	ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
	eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx];
	ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx];
	ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx];
	eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx];
	ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx];
	ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx];
	eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx];
	ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx];
	ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx];
	eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx];
	ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx];
	ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx];
	eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx];
	ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx];
	ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx];
	eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx];
	ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx];
	ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx];
	eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx];
	ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx];
	}
	}
	}

	FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
	FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
	FFT_SCALAR ekx6, eky6, ekz6;
	ekx0 = eky0 = ekz0 = ZEROF;
	ekx1 = eky1 = ekz1 = ZEROF;
	ekx2 = eky2 = ekz2 = ZEROF;
	ekx3 = eky3 = ekz3 = ZEROF;
	ekx4 = eky4 = ekz4 = ZEROF;
	ekx5 = eky5 = ekz5 = ZEROF;
	ekx6 = eky6 = ekz6 = ZEROF;

	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	ekx0 += ekx0_arr[l];
	eky0 += eky0_arr[l];
	ekz0 += ekz0_arr[l];
	ekx1 += ekx1_arr[l];
	eky1 += eky1_arr[l];
	ekz1 += ekz1_arr[l];
	ekx2 += ekx2_arr[l];
	eky2 += eky2_arr[l];
	ekz2 += ekz2_arr[l];
	ekx3 += ekx3_arr[l];
	eky3 += eky3_arr[l];
	ekz3 += ekz3_arr[l];
	ekx4 += ekx4_arr[l];
	eky4 += eky4_arr[l];
	ekz4 += ekz4_arr[l];
	ekx5 += ekx5_arr[l];
	eky5 += eky5_arr[l];
	ekz5 += ekz5_arr[l];
	ekx6 += ekx6_arr[l];
	eky6 += eky6_arr[l];
	ekz6 += ekz6_arr[l];
	}

	// convert D-field to force

	const int type = atom->type[i];
	const FFT_SCALAR lj0 = B[7*type+6];
	const FFT_SCALAR lj1 = B[7*type+5];
	const FFT_SCALAR lj2 = B[7*type+4];
	const FFT_SCALAR lj3 = B[7*type+3];
	const FFT_SCALAR lj4 = B[7*type+2];
	const FFT_SCALAR lj5 = B[7*type+1];
	const FFT_SCALAR lj6 = B[7*type];

	f[i][0] += lj0ekx0 + lj1ekx1 + lj2ekx2 + lj3ekx3 +
	lj4ekx4 + lj5ekx5 + lj6*ekx6;
	f[i][1] += lj0eky0 + lj1eky1 + lj2eky2 + lj3eky3 +
	lj4eky4 + lj5eky5 + lj6*eky6;
	if (slabflag != 2) f[i][2] += lj0ekz0 + lj1ekz1 + lj2*ekz2 +
	lj3ekz3 + lj4ekz4 + lj5ekz5 + lj6ekz6;
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for arithmetic mixing rule for the ad scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of dispersion field on particle

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0;
	FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0;
	FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0;
	FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1;
	FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1;
	FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1;
	FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2;
	FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2;
	FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2;
	FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3;
	FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3;
	FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3;
	FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4;
	FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4;
	FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4;
	FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5;
	FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5;
	FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5;
	FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6;
	FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6;
	FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double *prd;
	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double **x = atom->x;
	double **f = atom->f;
	const flt_t ftwo_pi = MY_PI * 2.0;
	const flt_t ffour_pi = MY_PI * 4.0;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshiftone = shiftone_6;

	const double xprd = prd[0];
	const double yprd = prd[1];
	const double zprd = prd[2]*slab_volfactor;

	const flt_t hx_inv = nx_pppm_6/xprd;
	const flt_t hy_inv = ny_pppm_6/yprd;
	const flt_t hz_inv = nz_pppm_6/zprd;

	const flt_t fsf_coeff0 = sf_coeff_6[0];
	const flt_t fsf_coeff1 = sf_coeff_6[1];
	const flt_t fsf_coeff2 = sf_coeff_6[2];
	const flt_t fsf_coeff3 = sf_coeff_6[3];
	const flt_t fsf_coeff4 = sf_coeff_6[4];
	const flt_t fsf_coeff5 = sf_coeff_6[5];

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];
	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho6_lookup[idx][k];
	rho[1][k] = rho6_lookup[idy][k];
	rho[2][k] = rho6_lookup[idz][k];
	drho[0][k] = drho6_lookup[idx][k];
	drho[1][k] = drho6_lookup[idy][k];
	drho[2][k] = drho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
	dr1 = dr2 = dr3 = ZEROF;

	r1 = rho_coeff_6[order_6-1][k];
	r2 = rho_coeff_6[order_6-1][k];
	r3 = rho_coeff_6[order_6-1][k];
	for (int l = order_6-2; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1 * dx;
	r2 = rho_coeff_6[l][k] + r2 * dy;
	r3 = rho_coeff_6[l][k] + r3 * dz;
	dr1 = drho_coeff_6[l][k] + dr1 * dx;
	dr2 = drho_coeff_6[l][k] + dr2 * dy;
	dr3 = drho_coeff_6[l][k] + dr3 * dz;
	}
	rho[0][k-nlower_6] = r1;
	rho[1][k-nlower_6] = r2;
	rho[2][k-nlower_6] = r3;
	drho[0][k-nlower_6] = dr1;
	drho[1][k-nlower_6] = dr2;
	drho[2][k-nlower_6] = dr3;
	}
	}
	_alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF;
	particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF;
	particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF;
	particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF;
	particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF;
	particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF;
	particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;

	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n + nzsum;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m + nysum;
	FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
	FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
	FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l + nxsum;
	FFT_SCALAR x0 = drho[0][l] * ekx_p;
	FFT_SCALAR y0 = rho[0][l] * eky_p;
	FFT_SCALAR z0 = rho[0][l] * ekz_p;

	ekx0[l] += x0 * u_brick_a0[mz][my][mx];
	eky0[l] += y0 * u_brick_a0[mz][my][mx];
	ekz0[l] += z0 * u_brick_a0[mz][my][mx];
	ekx1[l] += x0 * u_brick_a1[mz][my][mx];
	eky1[l] += y0 * u_brick_a1[mz][my][mx];
	ekz1[l] += z0 * u_brick_a1[mz][my][mx];
	ekx2[l] += x0 * u_brick_a2[mz][my][mx];
	eky2[l] += y0 * u_brick_a2[mz][my][mx];
	ekz2[l] += z0 * u_brick_a2[mz][my][mx];
	ekx3[l] += x0 * u_brick_a3[mz][my][mx];
	eky3[l] += y0 * u_brick_a3[mz][my][mx];
	ekz3[l] += z0 * u_brick_a3[mz][my][mx];
	ekx4[l] += x0 * u_brick_a4[mz][my][mx];
	eky4[l] += y0 * u_brick_a4[mz][my][mx];
	ekz4[l] += z0 * u_brick_a4[mz][my][mx];
	ekx5[l] += x0 * u_brick_a5[mz][my][mx];
	eky5[l] += y0 * u_brick_a5[mz][my][mx];
	ekz5[l] += z0 * u_brick_a5[mz][my][mx];
	ekx6[l] += x0 * u_brick_a6[mz][my][mx];
	eky6[l] += y0 * u_brick_a6[mz][my][mx];
	ekz6[l] += z0 * u_brick_a6[mz][my][mx];
	}
	}
	}

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
	particle_ekx0[i] += ekx0[l];
	particle_eky0[i] += eky0[l];
	particle_ekz0[i] += ekz0[l];
	particle_ekx1[i] += ekx1[l];
	particle_eky1[i] += eky1[l];
	particle_ekz1[i] += ekz1[l];
	particle_ekx2[i] += ekx2[l];
	particle_eky2[i] += eky2[l];
	particle_ekz2[i] += ekz2[l];
	particle_ekx3[i] += ekx3[l];
	particle_eky3[i] += eky3[l];
	particle_ekz3[i] += ekz3[l];
	particle_ekx4[i] += ekx4[l];
	particle_eky4[i] += eky4[l];
	particle_ekz4[i] += ekz4[l];
	particle_ekx5[i] += ekx5[l];
	particle_eky5[i] += eky5[l];
	particle_ekz5[i] += ekz5[l];
	particle_ekx6[i] += ekx6[l];
	particle_eky6[i] += eky6[l];
	particle_ekz6[i] += ekz6[l];
	}
	}
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int i = ifrom; i < ito; i++) {
	particle_ekx0[i] *= hx_inv;
	particle_eky0[i] *= hy_inv;
	particle_ekz0[i] *= hz_inv;
	particle_ekx1[i] *= hx_inv;
	particle_eky1[i] *= hy_inv;
	particle_ekz1[i] *= hz_inv;
	particle_ekx2[i] *= hx_inv;
	particle_eky2[i] *= hy_inv;
	particle_ekz2[i] *= hz_inv;
	particle_ekx3[i] *= hx_inv;
	particle_eky3[i] *= hy_inv;
	particle_ekz3[i] *= hz_inv;
	particle_ekx4[i] *= hx_inv;
	particle_eky4[i] *= hy_inv;
	particle_ekz4[i] *= hz_inv;
	particle_ekx5[i] *= hx_inv;
	particle_eky5[i] *= hy_inv;
	particle_ekz5[i] *= hz_inv;
	particle_ekx6[i] *= hx_inv;
	particle_eky6[i] *= hy_inv;
	particle_ekz6[i] *= hz_inv;

	// convert D-field to force

	const int type = atom->type[i];
	const FFT_SCALAR lj0 = B[7*type+6];
	const FFT_SCALAR lj1 = B[7*type+5];
	const FFT_SCALAR lj2 = B[7*type+4];
	const FFT_SCALAR lj3 = B[7*type+3];
	const FFT_SCALAR lj4 = B[7*type+2];
	const FFT_SCALAR lj5 = B[7*type+1];
	const FFT_SCALAR lj6 = B[7*type];

	const flt_t s1 = x[i][0] * hx_inv;
	const flt_t s2 = x[i][1] * hy_inv;
	const flt_t s3 = x[i][2] * hz_inv;
	flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
	sf += fsf_coeff1 * sin(ffour_pi * s1);
	sf = 4lj0lj6 + 4lj1lj5 + 4lj2lj4 + 2lj3*lj3;
	f[i][0] += lj0particle_ekx0[i] + lj1particle_ekx1[i] +
	lj2particle_ekx2[i] + lj3particle_ekx3[i] + lj4*particle_ekx4[i] +
	lj5particle_ekx5[i] + lj6particle_ekx6[i] - sf;

	sf = fsf_coeff2 * sin(ftwo_pi * s2);
	sf += fsf_coeff3 * sin(ffour_pi * s2);
	sf = 4lj0lj6 + 4lj1lj5 + 4lj2lj4 + 2lj3*lj3;
	f[i][1] += lj0particle_eky0[i] + lj1particle_eky1[i] +
	lj2particle_eky2[i] + lj3particle_eky3[i] + lj4*particle_eky4[i] +
	lj5particle_eky5[i] + lj6particle_eky6[i] - sf;

	sf = fsf_coeff4 * sin(ftwo_pi * s3);
	sf += fsf_coeff5 * sin(ffour_pi * s3);
	sf = 4lj0lj6 + 4lj1lj5 + 4lj2lj4 + 2lj3*lj3;
	if (slabflag != 2)
	f[i][2] += lj0particle_ekz0[i] + lj1particle_ekz1[i] +
	lj2particle_ekz2[i] + lj3particle_ekz3[i] + lj4*particle_ekz4[i] +
	lj5particle_ekz5[i] + lj6particle_ekz6[i] - sf;
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for no mixing rule and ik scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
	{

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of dispersion field on particle

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;


	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double lj;
	int type;
	double **x = atom->x;
	double **f = atom->f;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshiftone = shiftone_6;

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
	_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho0[k] = rho6_lookup[idx][k];
	rho1[k] = rho6_lookup[idy][k];
	rho2[k] = rho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
	FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
	FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
	for (int l = order_6-2; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1*dx;
	r2 = rho_coeff_6[l][k] + r2*dy;
	r3 = rho_coeff_6[l][k] + r3*dz;
	}

	rho0[k-nlower_6] = r1;
	rho1[k-nlower_6] = r2;
	rho2[k-nlower_6] = r3;
	}
	}


	_alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
	_alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
	_alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);

	for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
	ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF;
	}

	for (int k = 0; k < nsplit; k++) {
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n+nzsum;
	FFT_SCALAR z0 = rho2[n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m+nysum;
	FFT_SCALAR y0 = z0*rho1[m];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l+nxsum;
	FFT_SCALAR x0 = y0*rho0[l];
	ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
	x0*vdx_brick_none[k][mz][my][mx];
	eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
	x0*vdy_brick_none[k][mz][my][mx];
	ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
	x0*vdz_brick_none[k][mz][my][mx];
	}
	}
	}
	}

	_alignvar(FFT_SCALAR ekx[nsplit], 64);
	_alignvar(FFT_SCALAR eky[nsplit], 64);
	_alignvar(FFT_SCALAR ekz[nsplit], 64);
	for (int k = 0; k < nsplit; k++) {
	ekx[k] = eky[k] = ekz[k] = ZEROF;
	}

	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	for (int k = 0; k < nsplit; k++) {
	ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
	eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
	ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
	}
	}

	// convert E-field to force

	type = atom->type[i];
	for (int k = 0; k < nsplit; k++) {
	lj = B[nsplit*type + k];
	f[i][0] += lj*ekx[k];
	f[i][1] += lj*eky[k];
	if (slabflag != 2) f[i][2] += lj*ekz[k];
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for no mixing rule for the ad scheme
	------------------------------------------------------------------------- */

	template<class flt_t, class acc_t, int use_table>
	void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
	{
	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of dispersion field on particle

	int nlocal = atom->nlocal;
	int nthr = comm->nthreads;

	#if defined(_OPENMP)
	#pragma omp parallel default(none) \
	shared(nlocal, nthr) if(!_use_lrt)
	#endif
	{

	double *prd;
	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double **x = atom->x;
	double **f = atom->f;
	const flt_t ftwo_pi = MY_PI * 2.0;
	const flt_t ffour_pi = MY_PI * 4.0;

	const flt_t lo0 = boxlo[0];
	const flt_t lo1 = boxlo[1];
	const flt_t lo2 = boxlo[2];
	const flt_t xi = delxinv_6;
	const flt_t yi = delyinv_6;
	const flt_t zi = delzinv_6;
	const flt_t fshiftone = shiftone_6;

	const double xprd = prd[0];
	const double yprd = prd[1];
	const double zprd = prd[2]*slab_volfactor;

	const flt_t hx_inv = nx_pppm_6/xprd;
	const flt_t hy_inv = ny_pppm_6/yprd;
	const flt_t hz_inv = nz_pppm_6/zprd;

	const flt_t fsf_coeff0 = sf_coeff_6[0];
	const flt_t fsf_coeff1 = sf_coeff_6[1];
	const flt_t fsf_coeff2 = sf_coeff_6[2];
	const flt_t fsf_coeff3 = sf_coeff_6[3];
	const flt_t fsf_coeff4 = sf_coeff_6[4];
	const flt_t fsf_coeff5 = sf_coeff_6[5];

	int ifrom, ito, tid;
	IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

	_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
	_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

	for (int i = ifrom; i < ito; i++) {
	int nx = part2grid_6[i][0];
	int ny = part2grid_6[i][1];
	int nz = part2grid_6[i][2];
	FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
	FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
	FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;

	int nxsum = nx + nlower_6;
	int nysum = ny + nlower_6;
	int nzsum = nz + nlower_6;

	if (use_table) {
	dx = dx*half_rho_scale + half_rho_scale_plus;
	int idx = dx;
	dy = dy*half_rho_scale + half_rho_scale_plus;
	int idy = dy;
	dz = dz*half_rho_scale + half_rho_scale_plus;
	int idz = dz;

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho[0][k] = rho6_lookup[idx][k];
	rho[1][k] = rho6_lookup[idy][k];
	rho[2][k] = rho6_lookup[idz][k];
	drho[0][k] = drho6_lookup[idx][k];
	drho[1][k] = drho6_lookup[idy][k];
	drho[2][k] = drho6_lookup[idz][k];
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k = nlower_6; k <= nupper_6; k++) {
	FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
	dr1 = dr2 = dr3 = ZEROF;

	r1 = rho_coeff_6[order_6-1][k];
	r2 = rho_coeff_6[order_6-1][k];
	r3 = rho_coeff_6[order_6-1][k];
	for (int l = order_6-2; l >= 0; l--) {
	r1 = rho_coeff_6[l][k] + r1 * dx;
	r2 = rho_coeff_6[l][k] + r2 * dy;
	r3 = rho_coeff_6[l][k] + r3 * dz;
	dr1 = drho_coeff_6[l][k] + dr1 * dx;
	dr2 = drho_coeff_6[l][k] + dr2 * dy;
	dr3 = drho_coeff_6[l][k] + dr3 * dz;
	}
	rho[0][k-nlower_6] = r1;
	rho[1][k-nlower_6] = r2;
	rho[2][k-nlower_6] = r3;
	drho[0][k-nlower_6] = dr1;
	drho[1][k-nlower_6] = dr2;
	drho[2][k-nlower_6] = dr3;
	}
	}
	_alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
	_alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
	_alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);

	for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
	ekx[k]=eky[k]=ekz[k]=ZEROF;
	}

	for (int k = 0; k < nsplit; k++) {
	particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int n = 0; n < order_6; n++) {
	int mz = n + nzsum;
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count=7
	#endif
	for (int m = 0; m < order_6; m++) {
	int my = m + nysum;
	FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
	FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
	FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
	int mx = l + nxsum;
	ekx[kINTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] ekx_p *
	u_brick_none[k][mz][my][mx];
	eky[kINTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] eky_p *
	u_brick_none[k][mz][my][mx];
	ekz[kINTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] ekz_p *
	u_brick_none[k][mz][my][mx];
	}
	}
	}
	}

	_alignvar(FFT_SCALAR ekx_tot[nsplit], 64);
	_alignvar(FFT_SCALAR eky_tot[nsplit], 64);
	_alignvar(FFT_SCALAR ekz_tot[nsplit], 64);
	for (int k = 0; k < nsplit; k++) {
	ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF;
	}

	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
	for (int k = 0; k < nsplit; k++) {
	ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
	eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
	ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l];
	}
	}

	for (int k = 0; k < nsplit; k++) {
	ekx_tot[k] *= hx_inv;
	eky_tot[k] *= hy_inv;
	ekz_tot[k] *= hz_inv;
	}
	// convert D-field to force

	const int type = atom->type[i];

	const flt_t s1 = x[i][0] * hx_inv;
	const flt_t s2 = x[i][1] * hy_inv;
	const flt_t s3 = x[i][2] * hz_inv;
	flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1);
	sf1 += fsf_coeff1 * sin(ffour_pi * s1);

	flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2);
	sf2 += fsf_coeff3 * sin(ffour_pi * s2);

	flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3);
	sf3 += fsf_coeff5 * sin(ffour_pi * s3);
	for (int k = 0; k < nsplit; k++) {
	const flt_t lj = B[nsplit*type + k];
	const flt_t twoljsq = ljlj B[k] * 2;
	flt_t sf = sf1*twoljsq;
	f[i][0] += lj * ekx_tot[k] - sf;
	sf = sf2*twoljsq;
	f[i][1] += lj * eky_tot[k] - sf;
	sf = sf3*twoljsq;
	if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	precompute rho coefficients as a lookup table to save time in make_rho
	and fieldforce. Instead of doing this polynomial for every atom 6 times
	per time step, precompute it for some number of points.
	------------------------------------------------------------------------- */

	void PPPMDispIntel::precompute_rho()
	{

	half_rho_scale = (rho_points - 1.)/2.;
	half_rho_scale_plus = half_rho_scale + 0.5;

	for (int i = 0; i < rho_points; i++) {
	FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k=nlower; k<=nupper;k++){
	FFT_SCALAR r1 = ZEROF;
	for(int l=order-1; l>=0; l--){
	r1 = rho_coeff[l][k] + r1*dx;
	}
	rho_lookup[i][k-nlower] = r1;
	}
	for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho_lookup[i][k] = 0;
	}
	if (differentiation_flag == 1) {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k=nlower; k<=nupper;k++){
	FFT_SCALAR r1 = ZEROF;
	for(int l=order-2; l>=0; l--){
	r1 = drho_coeff[l][k] + r1*dx;
	}
	drho_lookup[i][k-nlower] = r1;
	}
	for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	drho_lookup[i][k] = 0;
	}
	}
	}
	for (int i = 0; i < rho_points; i++) {
	FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for (int k=nlower_6; k<=nupper_6;k++){
	FFT_SCALAR r1 = ZEROF;
	for(int l=order_6-1; l>=0; l--){
	r1 = rho_coeff_6[l][k] + r1*dx;
	}
	rho6_lookup[i][k-nlower_6] = r1;
	}
	for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	rho6_lookup[i][k] = 0;
	}
	if (differentiation_flag == 1) {
	#if defined(LMP_SIMD_COMPILER)
	#pragma simd
	#endif
	for(int k=nlower_6; k<=nupper_6;k++){
	FFT_SCALAR r1 = ZEROF;
	for(int l=order_6-2; l>=0; l--){
	r1 = drho_coeff_6[l][k] + r1*dx;
	}
	drho6_lookup[i][k-nlower_6] = r1;
	}
	for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
	drho6_lookup[i][k] = 0;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	Returns 0 if Intel optimizations for PPPM ignored due to offload
	------------------------------------------------------------------------- */

	#ifdef _LMP_INTEL_OFFLOAD
	int PPPMDispIntel::use_base() {
	return _use_base;
	}
	#endif

pppm_disp_intel.cppNo OneTemporaryActions

File Metadata

pppm_disp_intel.cppView Options

Event Timeline

pppm_disp_intel.cpp
No OneTemporary
Actions

pppm_disp_intel.cpp
View Options