pppm_gpu.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Aug 23, 06:22

pppm_gpu.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing authors: Mike Brown (ORNL), Axel Kohlmeyer (Temple)
	------------------------------------------------------------------------- */

	#include "lmptype.h"
	#include "mpi.h"
	#include "string.h"
	#include "stdio.h"
	#include "stdlib.h"
	#include "math.h"
	#include "pppm_gpu.h"
	#include "atom.h"
	#include "comm.h"
	#include "neighbor.h"
	#include "force.h"
	#include "pair.h"
	#include "bond.h"
	#include "angle.h"
	#include "domain.h"
	#include "fft3d_wrap.h"
	#include "remap_wrap.h"
	#include "gpu_extra.h"
	#include "math_const.h"
	#include "memory.h"
	#include "error.h"

	using namespace LAMMPS_NS;
	using namespace MathConst;

	#define MAXORDER 7
	#define OFFSET 16384
	#define SMALL 0.00001
	#define LARGE 10000.0
	#define EPS_HOC 1.0e-7

	#ifdef FFT_SINGLE
	#define ZEROF 0.0f
	#define ONEF 1.0f
	#else
	#define ZEROF 0.0
	#define ONEF 1.0
	#endif


	// External functions from cuda library for atom decomposition
	#ifdef FFT_SINGLE
	#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f
	#else
	#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _d
	#endif
	FFT_SCALAR* PPPM_GPU_API(init)(const int nlocal, const int nall, FILE *screen,
	const int order, const int nxlo_out,
	const int nylo_out, const int nzlo_out,
	const int nxhi_out, const int nyhi_out,
	const int nzhi_out, FFT_SCALAR **rho_coeff,
	FFT_SCALAR **_vd_brick, const double slab_volfactor,
	const int nx_pppm, const int ny_pppm,
	const int nz_pppm, int &success);
	void PPPM_GPU_API(clear)(const double poisson_time);
	int PPPM_GPU_API(spread)(const int ago, const int nlocal, const int nall,
	double *host_x, int host_type, bool &success,
	double host_q, double boxlo, const double delxinv,
	const double delyinv, const double delzinv);
	void PPPM_GPU_API(interp)(const FFT_SCALAR qqrd2e_scale);
	double PPPM_GPU_API(bytes)();

	/* ---------------------------------------------------------------------- */

	PPPMGPU::PPPMGPU(LAMMPS lmp, int narg, char *arg) : PPPM(lmp, narg, arg)
	{
	if (narg != 1) error->all(FLERR,"Illegal kspace_style pppm/gpu command");

	density_brick_gpu = vd_brick = NULL;
	}

	/* ----------------------------------------------------------------------
	free all memory
	------------------------------------------------------------------------- */

	PPPMGPU::~PPPMGPU()
	{
	PPPM_GPU_API(clear)(poisson_time);
	}

	/* ----------------------------------------------------------------------
	called once before run
	------------------------------------------------------------------------- */

	void PPPMGPU::init()
	{
	PPPM::init();

	// GPU precision specific init.
	if (order>8)
	error->all(FLERR,"Cannot use order greater than 8 with pppm/gpu.");
	PPPM_GPU_API(clear)(poisson_time);

	int success;
	FFT_SCALAR data, h_brick;
	h_brick = PPPM_GPU_API(init)(atom->nlocal, atom->nlocal+atom->nghost, screen,
	order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
	nyhi_out, nzhi_out, rho_coeff, &data,
	slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);

	GPU_EXTRA::check_flag(success,error,world);

	density_brick_gpu =
	create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
	nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1);
	vd_brick =
	create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
	nxlo_out,nxhi_out,"pppm:vd_brick",data,4);

	poisson_time=0;
	}

	/* ----------------------------------------------------------------------
	compute the PPPMGPU long-range force, energy, virial
	------------------------------------------------------------------------- */

	void PPPMGPU::compute(int eflag, int vflag)
	{
	bool success = true;
	int flag=PPPM_GPU_API(spread)(neighbor->ago, atom->nlocal, atom->nlocal +
	atom->nghost, atom->x, atom->type, success,
	atom->q, domain->boxlo, delxinv, delyinv,
	delzinv);
	if (!success)
	error->one(FLERR,"Out of memory on GPGPU");
	if (flag != 0)
	error->one(FLERR,"Out of range atoms - cannot compute PPPM");

	int i;

	// convert atoms from box to lamda coords

	if (triclinic == 0) boxlo = domain->boxlo;
	else {
	boxlo = domain->boxlo_lamda;
	domain->x2lamda(atom->nlocal);
	}

	energy = 0.0;
	if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;

	double t3=MPI_Wtime();

	// all procs communicate density values from their ghost cells
	// to fully sum contribution in their 3d bricks
	// remap from 3d decomposition to FFT decomposition

	brick2fft();

	// compute potential gradient on my FFT grid and
	// portion of e_long on this proc's FFT grid
	// return gradients (electric fields) in 3d brick decomposition

	poisson(eflag,vflag);

	// all procs communicate E-field values to fill ghost cells
	// surrounding their 3d bricks

	fillbrick();

	poisson_time+=MPI_Wtime()-t3;

	// calculate the force on my particles

	FFT_SCALAR qscale = force->qqrd2e * scale;
	PPPM_GPU_API(interp)(qscale);

	// sum energy across procs and add in volume-dependent term

	if (eflag) {
	double energy_all;
	MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
	energy = energy_all;

	energy = 0.5volume;
	energy -= g_ewald*qsqsum/1.772453851 +
	MY_PI2qsumqsum / (g_ewaldg_ewaldvolume);
	energy *= qscale;
	}

	// sum virial across procs

	if (vflag) {
	double virial_all[6];
	MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
	for (i = 0; i < 6; i++) virial[i] = 0.5qscalevolume*virial_all[i];
	}

	// 2d slab correction

	if (slabflag) slabcorr(eflag);

	// convert atoms back from lamda to box coords

	if (triclinic) domain->lamda2x(atom->nlocal);
	}

	/* ----------------------------------------------------------------------
	allocate memory that depends on # of K-vectors and order
	------------------------------------------------------------------------- */

	void PPPMGPU::allocate()
	{
	memory->create(density_fft,nfft_both,"pppm:density_fft");
	memory->create(greensfn,nfft_both,"pppm:greensfn");
	memory->create(work1,2*nfft_both,"pppm:work1");
	memory->create(work2,2*nfft_both,"pppm:work2");
	memory->create(vg,nfft_both,6,"pppm:vg");

	memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
	memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
	memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");

	memory->create(buf1,nbuf,"pppm:buf1");
	memory->create(buf2,nbuf,"pppm:buf2");

	// summation coeffs

	memory->create(gf_b,order,"pppm:gf_b");
	memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
	memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");

	// create 2 FFTs and a Remap
	// 1st FFT keeps data in FFT decompostion
	// 2nd FFT returns data in 3d brick decomposition
	// remap takes data from 3d brick to FFT decomposition

	int tmp;

	fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
	nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
	nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
	0,0,&tmp);

	fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
	nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
	nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
	0,0,&tmp);

	remap = new Remap(lmp,world,
	nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
	nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
	1,0,0,FFT_PRECISION);
	}

	/* ----------------------------------------------------------------------
	deallocate memory that depends on # of K-vectors and order
	------------------------------------------------------------------------- */

	void PPPMGPU::deallocate()
	{
	destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
	destroy_3d_offset(vd_brick,nzlo_out,nylo_out);

	memory->destroy(density_fft);
	memory->destroy(greensfn);
	memory->destroy(work1);
	memory->destroy(work2);
	memory->destroy(vg);

	memory->destroy1d_offset(fkx,nxlo_fft);
	memory->destroy1d_offset(fky,nylo_fft);
	memory->destroy1d_offset(fkz,nzlo_fft);

	memory->destroy(buf1);
	memory->destroy(buf2);

	memory->destroy(gf_b);
	memory->destroy2d_offset(rho1d,-order/2);
	memory->destroy2d_offset(rho_coeff,(1-order)/2);

	delete fft1;
	delete fft2;
	delete remap;
	}


	/* ----------------------------------------------------------------------
	ghost-swap to accumulate full density in brick decomposition
	remap density from 3d brick decomposition to FFT decomposition
	------------------------------------------------------------------------- */

	void PPPMGPU::brick2fft()
	{
	int i,n,ix,iy,iz;
	MPI_Request request;
	MPI_Status status;

	// pack my ghosts for +x processor
	// pass data to self or +x processor
	// unpack and sum recv data into my real cells

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = nxhi_in+1; ix <= nxhi_out; ix++)
	buf1[n++] = density_brick_gpu[iz][iy][ix];

	if (comm->procneigh[0][1] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++)
	density_brick_gpu[iz][iy][ix] += buf2[n++];

	// pack my ghosts for -x processor
	// pass data to self or -x processor
	// unpack and sum recv data into my real cells

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = nxlo_out; ix < nxlo_in; ix++)
	buf1[n++] = density_brick_gpu[iz][iy][ix];

	if (comm->procneigh[0][0] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++)
	density_brick_gpu[iz][iy][ix] += buf2[n++];

	// pack my ghosts for +y processor
	// pass data to self or +y processor
	// unpack and sum recv data into my real cells

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	buf1[n++] = density_brick_gpu[iz][iy][ix];

	if (comm->procneigh[1][1] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	density_brick_gpu[iz][iy][ix] += buf2[n++];

	// pack my ghosts for -y processor
	// pass data to self or -y processor
	// unpack and sum recv data into my real cells

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy < nylo_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	buf1[n++] = density_brick_gpu[iz][iy][ix];

	if (comm->procneigh[1][0] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	density_brick_gpu[iz][iy][ix] += buf2[n++];

	// pack my ghosts for +z processor
	// pass data to self or +z processor
	// unpack and sum recv data into my real cells

	n = 0;
	for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	buf1[n++] = density_brick_gpu[iz][iy][ix];

	if (comm->procneigh[2][1] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	density_brick_gpu[iz][iy][ix] += buf2[n++];

	// pack my ghosts for -z processor
	// pass data to self or -z processor
	// unpack and sum recv data into my real cells

	n = 0;
	for (iz = nzlo_out; iz < nzlo_in; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	buf1[n++] = density_brick_gpu[iz][iy][ix];

	if (comm->procneigh[2][0] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	density_brick_gpu[iz][iy][ix] += buf2[n++];

	// remap from 3d brick decomposition to FFT decomposition
	// copy grabs inner portion of density from 3d brick
	// remap could be done as pre-stage of FFT,
	// but this works optimally on only double values, not complex values

	n = 0;
	for (iz = nzlo_in; iz <= nzhi_in; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = nxlo_in; ix <= nxhi_in; ix++)
	density_fft[n++] = density_brick_gpu[iz][iy][ix];

	remap->perform(density_fft,density_fft,work1);
	}

	/* ----------------------------------------------------------------------
	ghost-swap to fill ghost cells of my brick with field values
	------------------------------------------------------------------------- */

	void PPPMGPU::fillbrick()
	{
	int i,n,ix,iy,iz;
	MPI_Request request;
	MPI_Status status;

	// pack my real cells for +z processor
	// pass data to self or +z processor
	// unpack and sum recv data into my ghost cells

	n = 0;
	int x_lo = nxlo_in * 4;
	int x_hi = nxhi_in * 4 + 1;
	for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	buf1[n++] = vd_brick[iz][iy][ix];
	buf1[n++] = vd_brick[iz][iy][ix+1];
	buf1[n++] = vd_brick[iz][iy][ix+2];
	}

	if (comm->procneigh[2][1] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz < nzlo_in; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	vd_brick[iz][iy][ix] = buf2[n++];
	vd_brick[iz][iy][ix+1] = buf2[n++];
	vd_brick[iz][iy][ix+2] = buf2[n++];
	}

	// pack my real cells for -z processor
	// pass data to self or -z processor
	// unpack and sum recv data into my ghost cells

	n = 0;
	for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	buf1[n++] = vd_brick[iz][iy][ix];
	buf1[n++] = vd_brick[iz][iy][ix+1];
	buf1[n++] = vd_brick[iz][iy][ix+2];
	}

	if (comm->procneigh[2][0] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
	for (iy = nylo_in; iy <= nyhi_in; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	vd_brick[iz][iy][ix] = buf2[n++];
	vd_brick[iz][iy][ix+1] = buf2[n++];
	vd_brick[iz][iy][ix+2] = buf2[n++];
	}

	// pack my real cells for +y processor
	// pass data to self or +y processor
	// unpack and sum recv data into my ghost cells

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	buf1[n++] = vd_brick[iz][iy][ix];
	buf1[n++] = vd_brick[iz][iy][ix+1];
	buf1[n++] = vd_brick[iz][iy][ix+2];
	}

	if (comm->procneigh[1][1] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy < nylo_in; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	vd_brick[iz][iy][ix] = buf2[n++];
	vd_brick[iz][iy][ix+1] = buf2[n++];
	vd_brick[iz][iy][ix+2] = buf2[n++];
	}

	// pack my real cells for -y processor
	// pass data to self or -y processor
	// unpack and sum recv data into my ghost cells

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	buf1[n++] = vd_brick[iz][iy][ix];
	buf1[n++] = vd_brick[iz][iy][ix+1];
	buf1[n++] = vd_brick[iz][iy][ix+2];
	}

	if (comm->procneigh[1][0] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	vd_brick[iz][iy][ix] = buf2[n++];
	vd_brick[iz][iy][ix+1] = buf2[n++];
	vd_brick[iz][iy][ix+2] = buf2[n++];
	}

	// pack my real cells for +x processor
	// pass data to self or +x processor
	// unpack and sum recv data into my ghost cells

	n = 0;
	x_lo = (nxhi_in-nxhi_ghost+1) * 4;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	buf1[n++] = vd_brick[iz][iy][ix];
	buf1[n++] = vd_brick[iz][iy][ix+1];
	buf1[n++] = vd_brick[iz][iy][ix+2];
	}

	if (comm->procneigh[0][1] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	x_lo = nxlo_out * 4;
	x_hi = nxlo_in * 4;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	vd_brick[iz][iy][ix] = buf2[n++];
	vd_brick[iz][iy][ix+1] = buf2[n++];
	vd_brick[iz][iy][ix+2] = buf2[n++];
	}

	// pack my real cells for -x processor
	// pass data to self or -x processor
	// unpack and sum recv data into my ghost cells

	n = 0;
	x_lo = x_hi;
	x_hi = (nxlo_in+nxlo_ghost) * 4;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	buf1[n++] = vd_brick[iz][iy][ix];
	buf1[n++] = vd_brick[iz][iy][ix+1];
	buf1[n++] = vd_brick[iz][iy][ix+2];
	}

	if (comm->procneigh[0][0] == me)
	for (i = 0; i < n; i++) buf2[i] = buf1[i];
	else {
	MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
	MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
	MPI_Wait(&request,&status);
	}

	n = 0;
	x_lo = (nxhi_in + 1) * 4;
	x_hi = nxhi_out * 4 + 1;
	for (iz = nzlo_out; iz <= nzhi_out; iz++)
	for (iy = nylo_out; iy <= nyhi_out; iy++)
	for (ix = x_lo; ix < x_hi; ix+=4) {
	vd_brick[iz][iy][ix] = buf2[n++];
	vd_brick[iz][iy][ix+1] = buf2[n++];
	vd_brick[iz][iy][ix+2] = buf2[n++];
	}
	}

	/* ----------------------------------------------------------------------
	FFT-based Poisson solver
	------------------------------------------------------------------------- */

	void PPPMGPU::poisson(int eflag, int vflag)
	{
	int i,j,k,n;
	double eng;

	// transform charge density (r -> k)

	n = 0;
	for (i = 0; i < nfft; i++) {
	work1[n++] = density_fft[i];
	work1[n++] = ZEROF;
	}

	fft1->compute(work1,work1,1);

	// if requested, compute energy and virial contribution

	double scaleinv = 1.0/(nx_pppmny_pppmnz_pppm);
	double s2 = scaleinv*scaleinv;

	if (eflag \|\| vflag) {
	if (vflag) {
	n = 0;
	for (i = 0; i < nfft; i++) {
	eng = s2 * greensfn[i] * (work1[n]work1[n] + work1[n+1]work1[n+1]);
	for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
	energy += eng;
	n += 2;
	}
	} else {
	n = 0;
	for (i = 0; i < nfft; i++) {
	energy +=
	s2 * greensfn[i] * (work1[n]work1[n] + work1[n+1]work1[n+1]);
	n += 2;
	}
	}
	}

	// scale by 1/total-grid-pts to get rho(k)
	// multiply by Green's function to get V(k)

	n = 0;
	for (i = 0; i < nfft; i++) {
	work1[n++] = scaleinv greensfn[i];
	work1[n++] = scaleinv greensfn[i];
	}

	// compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
	// FFT leaves data in 3d brick decomposition
	// copy it into inner portion of vdx,vdy,vdz arrays

	// x direction gradient

	n = 0;
	for (k = nzlo_fft; k <= nzhi_fft; k++)
	for (j = nylo_fft; j <= nyhi_fft; j++)
	for (i = nxlo_fft; i <= nxhi_fft; i++) {
	work2[n] = fkx[i]*work1[n+1];
	work2[n+1] = -fkx[i]*work1[n];
	n += 2;
	}

	fft2->compute(work2,work2,-1);

	n = 0;
	int x_hi = nxhi_in * 4 + 3;
	for (k = nzlo_in; k <= nzhi_in; k++)
	for (j = nylo_in; j <= nyhi_in; j++)
	for (i = nxlo_in * 4; i < x_hi; i+=4) {
	vd_brick[k][j][i] = work2[n];
	n += 2;
	}

	// y direction gradient

	n = 0;
	for (k = nzlo_fft; k <= nzhi_fft; k++)
	for (j = nylo_fft; j <= nyhi_fft; j++)
	for (i = nxlo_fft; i <= nxhi_fft; i++) {
	work2[n] = fky[j]*work1[n+1];
	work2[n+1] = -fky[j]*work1[n];
	n += 2;
	}

	fft2->compute(work2,work2,-1);

	n = 0;
	for (k = nzlo_in; k <= nzhi_in; k++)
	for (j = nylo_in; j <= nyhi_in; j++)
	for (i = nxlo_in * 4 + 1; i < x_hi; i+=4) {
	vd_brick[k][j][i] = work2[n];
	n += 2;
	}

	// z direction gradient

	n = 0;
	for (k = nzlo_fft; k <= nzhi_fft; k++)
	for (j = nylo_fft; j <= nyhi_fft; j++)
	for (i = nxlo_fft; i <= nxhi_fft; i++) {
	work2[n] = fkz[k]*work1[n+1];
	work2[n+1] = -fkz[k]*work1[n];
	n += 2;
	}

	fft2->compute(work2,work2,-1);

	n = 0;
	for (k = nzlo_in; k <= nzhi_in; k++)
	for (j = nylo_in; j <= nyhi_in; j++)
	for (i = nxlo_in * 4 + 2; i < x_hi; i+=4) {
	vd_brick[k][j][i] = work2[n];
	n += 2;
	}
	}

	/* ----------------------------------------------------------------------
	Create array using offsets from pinned memory allocation
	------------------------------------------------------------------------- */

	FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi,
	int n3lo, int n3hi, const char *name,
	FFT_SCALAR *data, int vec_length)
	{
	int i,j;
	int n1 = n1hi - n1lo + 1;
	int n2 = n2hi - n2lo + 1;
	int n3 = n3hi - n3lo + 1;

	FFT_SCALAR plane = (FFT_SCALAR )memory->smalloc(n1n2sizeof(FFT_SCALAR *),name);
	FFT_SCALAR *array = (FFT_SCALAR )memory->smalloc(n1sizeof(FFT_SCALAR **),name);

	int n = 0;
	for (i = 0; i < n1; i++) {
	array[i] = &plane[i*n2];
	for (j = 0; j < n2; j++) {
	plane[i*n2+j] = &data[n];
	n += n3*vec_length;
	}
	}

	for (i = 0; i < n1n2; i++) array[0][i] -= n3lovec_length;
	for (i = 0; i < n1; i++) array[i] -= n2lo;
	return array-n1lo;
	}

	/* ----------------------------------------------------------------------
	3d memory offsets
	------------------------------------------------------------------------- */

	void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,
	int n2_offset)
	{
	if (array == NULL) return;
	memory->sfree(&array[n1_offset][n2_offset]);
	memory->sfree(array + n1_offset);
	}


	/* ----------------------------------------------------------------------
	memory usage of local arrays
	------------------------------------------------------------------------- */

	double PPPMGPU::memory_usage()
	{
	double bytes = nmax3 sizeof(double);
	int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
	(nzhi_out-nzlo_out+1);
	bytes += 4 * nbrick * sizeof(FFT_SCALAR);
	bytes += 6 * nfft_both * sizeof(double);
	bytes += nfft_both * sizeof(double);
	bytes += nfft_both5 sizeof(FFT_SCALAR);
	bytes += 2 * nbuf * sizeof(double);
	return bytes + PPPM_GPU_API(bytes)();
	}

pppm_gpu.cppNo OneTemporaryActions

File Metadata

pppm_gpu.cppView Options

Event Timeline

pppm_gpu.cpp
No OneTemporary
Actions

pppm_gpu.cpp
View Options