/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), Axel Kohlmeyer (Temple)
------------------------------------------------------------------------- */
#include "lmptype.h"
#include "mpi.h"
#include "string.h"
#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "pppm_gpu.h"
#include "atom.h"
#include "comm.h"
#include "neighbor.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
#include "angle.h"
#include "domain.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "gpu_extra.h"
#include "math_const.h"
#include "memory.h"
#include "error.h"
using namespace LAMMPS_NS;
using namespace MathConst;
#define MAXORDER 7
#define OFFSET 16384
#define SMALL 0.00001
#define LARGE 10000.0
#define EPS_HOC 1.0e-7
#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF 1.0f
#else
#define ZEROF 0.0
#define ONEF 1.0
#endif
// External functions from cuda library for atom decomposition
#ifdef FFT_SINGLE
#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f
#else
#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _d
#endif
FFT_SCALAR* PPPM_GPU_API(init)(const int nlocal, const int nall, FILE *screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, FFT_SCALAR **rho_coeff,
FFT_SCALAR **_vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, int &success);
void PPPM_GPU_API(clear)(const double poisson_time);
int PPPM_GPU_API(spread)(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo, const double delxinv,
const double delyinv, const double delzinv);
void PPPM_GPU_API(interp)(const FFT_SCALAR qqrd2e_scale);
double PPPM_GPU_API(bytes)();
/* ---------------------------------------------------------------------- */
PPPMGPU::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
{
if (narg != 1) error->all(FLERR,"Illegal kspace_style pppm/gpu command");
density_brick_gpu = vd_brick = NULL;
}
/* ----------------------------------------------------------------------
free all memory
------------------------------------------------------------------------- */
PPPMGPU::~PPPMGPU()
{
PPPM_GPU_API(clear)(poisson_time);
}
/* ----------------------------------------------------------------------
called once before run
------------------------------------------------------------------------- */
void PPPMGPU::init()
{
PPPM::init();
// GPU precision specific init.
if (order>8)
error->all(FLERR,"Cannot use order greater than 8 with pppm/gpu.");
PPPM_GPU_API(clear)(poisson_time);
int success;
FFT_SCALAR *data, *h_brick;
h_brick = PPPM_GPU_API(init)(atom->nlocal, atom->nlocal+atom->nghost, screen,
order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
nyhi_out, nzhi_out, rho_coeff, &data,
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
GPU_EXTRA::check_flag(success,error,world);
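// wrap the pinned host buffers returned by the GPU library in offset arrays:
// density_brick_gpu holds 1 value per grid point, vd_brick holds 4 interleaved
// values per grid point (the x,y,z field components; the 4th slot is not
// written in this file)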
density_brick_gpu =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1);
vd_brick =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vd_brick",data,4);
poisson_time=0;
}
/* ----------------------------------------------------------------------
compute the PPPMGPU long-range force, energy, virial
------------------------------------------------------------------------- */
void PPPMGPU::compute(int eflag, int vflag)
{
bool success = true;
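// charge assignment (spreading) runs on the GPU; the resulting grid charge
// density ends up in the pinned buffer wrapped by density_brick_gpu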
int flag=PPPM_GPU_API(spread)(neighbor->ago, atom->nlocal, atom->nlocal +
atom->nghost, atom->x, atom->type, success,
atom->q, domain->boxlo, delxinv, delyinv,
delzinv);
if (!success)
error->one(FLERR,"Out of memory on GPGPU");
if (flag != 0)
error->one(FLERR,"Out of range atoms - cannot compute PPPM");
int i;
// convert atoms from box to lamda coords
if (triclinic == 0) boxlo = domain->boxlo;
else {
boxlo = domain->boxlo_lamda;
domain->x2lamda(atom->nlocal);
}
energy = 0.0;
if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
double t3=MPI_Wtime();
// all procs communicate density values from their ghost cells
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
brick2fft();
// compute potential gradient on my FFT grid and
// portion of e_long on this proc's FFT grid
// return gradients (electric fields) in 3d brick decomposition
poisson(eflag,vflag);
// all procs communicate E-field values to fill ghost cells
// surrounding their 3d bricks
fillbrick();
poisson_time+=MPI_Wtime()-t3;
// calculate the force on my particles
FFT_SCALAR qqrd2e_scale=force->qqrd2e*scale;
PPPM_GPU_API(interp)(qqrd2e_scale);
// sum energy across procs and add in volume-dependent term
if (eflag) {
double energy_all;
MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
energy = energy_all;
energy *= 0.5*volume;
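// subtract the Ewald self-energy and the neutralizing-background term
// (1.772453851 is sqrt(pi); MY_PI2 is pi/2)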
energy -= g_ewald*qsqsum/1.772453851 +
MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
energy *= force->qqrd2e*scale;
}
// sum virial across procs
if (vflag) {
double virial_all[6];
MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
for (i = 0; i < 6; i++) virial[i] = 0.5*force->qqrd2e*scale*volume*virial_all[i];
}
// 2d slab correction
if (slabflag) slabcorr(eflag);
// convert atoms back from lamda to box coords
if (triclinic) domain->lamda2x(atom->nlocal);
}
/* ----------------------------------------------------------------------
allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMGPU::allocate()
{
memory->create(density_fft,nfft_both,"pppm:density_fft");
memory->create(greensfn,nfft_both,"pppm:greensfn");
memory->create(work1,2*nfft_both,"pppm:work1");
memory->create(work2,2*nfft_both,"pppm:work2");
memory->create(vg,nfft_both,6,"pppm:vg");
memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
memory->create(buf1,nbuf,"pppm:buf1");
memory->create(buf2,nbuf,"pppm:buf2");
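// buf1/buf2 are the send/recv buffers for the ghost-cell swaps
// in brick2fft() and fillbrick()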
// summation coeffs
memory->create(gf_b,order,"pppm:gf_b");
memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
// create 2 FFTs and a Remap
// 1st FFT keeps data in FFT decomposition
// 2nd FFT returns data in 3d brick decomposition
// remap takes data from 3d brick to FFT decomposition
int tmp;
fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp);
fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp);
remap = new Remap(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION);
}
/* ----------------------------------------------------------------------
deallocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMGPU::deallocate()
{
destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
memory->destroy(density_fft);
memory->destroy(greensfn);
memory->destroy(work1);
memory->destroy(work2);
memory->destroy(vg);
memory->destroy1d_offset(fkx,nxlo_fft);
memory->destroy1d_offset(fky,nylo_fft);
memory->destroy1d_offset(fkz,nzlo_fft);
memory->destroy(buf1);
memory->destroy(buf2);
memory->destroy(gf_b);
memory->destroy2d_offset(rho1d,-order/2);
memory->destroy2d_offset(rho_coeff,(1-order)/2);
delete fft1;
delete fft2;
delete remap;
}
/* ----------------------------------------------------------------------
ghost-swap to accumulate full density in brick decomposition
remap density from 3d brick decomposition to FFT decomposition
------------------------------------------------------------------------- */
void PPPMGPU::brick2fft()
{
int i,n,ix,iy,iz;
MPI_Request request;
MPI_Status status;
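// six ghost swaps follow, one per direction (+x,-x,+y,-y,+z,-z); when the
// neighbor in a direction is this proc itself, the data is copied locally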
// pack my ghosts for +x processor
// pass data to self or +x processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxhi_in+1; ix <= nxhi_out; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[0][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for -x processor
// pass data to self or -x processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxlo_out; ix < nxlo_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[0][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for +y processor
// pass data to self or +y processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[1][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for -y processor
// pass data to self or -y processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy < nylo_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[1][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for +z processor
// pass data to self or +z processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[2][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for -z processor
// pass data to self or -z processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz < nzlo_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[2][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// remap from 3d brick decomposition to FFT decomposition
// copy grabs inner portion of density from 3d brick
// remap could be done as pre-stage of FFT,
// but this works optimally on only double values, not complex values
n = 0;
for (iz = nzlo_in; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_fft[n++] = density_brick_gpu[iz][iy][ix];
remap->perform(density_fft,density_fft,work1);
}
/* ----------------------------------------------------------------------
ghost-swap to fill ghost cells of my brick with field values
------------------------------------------------------------------------- */
void PPPMGPU::fillbrick()
{
int i,n,ix,iy,iz;
MPI_Request request;
MPI_Status status;
// pack my real cells for +z processor
// pass data to self or +z processor
// unpack and sum recv data into my ghost cells
n = 0;
int x_lo = nxlo_in * 4;
int x_hi = nxhi_in * 4 + 1;
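// vd_brick interleaves 4 values per grid point, so x indices are scaled by 4;
// only the first 3 values (the field components) are packed and exchanged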
for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[2][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz < nzlo_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for -z processor
// pass data to self or -z processor
// unpack and sum recv data into my ghost cells
n = 0;
for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[2][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for +y processor
// pass data to self or +y processor
// unpack and sum recv data into my ghost cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[1][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy < nylo_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for -y processor
// pass data to self or -y processor
// unpack and sum recv data into my ghost cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[1][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for +x processor
// pass data to self or +x processor
// unpack and sum recv data into my ghost cells
n = 0;
x_lo = (nxhi_in-nxhi_ghost+1) * 4;
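// x_hi is still nxhi_in*4 + 1 from above, so this packs the uppermost
// nxhi_ghost planes of real cells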
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[0][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
x_lo = nxlo_out * 4;
x_hi = nxlo_in * 4;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for -x processor
// pass data to self or -x processor
// unpack and sum recv data into my ghost cells
n = 0;
x_lo = x_hi;
x_hi = (nxlo_in+nxlo_ghost) * 4;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[0][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
x_lo = (nxhi_in + 1) * 4;
x_hi = nxhi_out * 4 + 1;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
}
/* ----------------------------------------------------------------------
FFT-based Poisson solver
------------------------------------------------------------------------- */
void PPPMGPU::poisson(int eflag, int vflag)
{
int i,j,k,n;
double eng;
// transform charge density (r -> k)
n = 0;
for (i = 0; i < nfft; i++) {
work1[n++] = density_fft[i];
work1[n++] = ZEROF;
}
fft1->compute(work1,work1,1);
// if requested, compute energy and virial contribution
double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
double s2 = scaleinv*scaleinv;
if (eflag || vflag) {
if (vflag) {
n = 0;
for (i = 0; i < nfft; i++) {
eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
energy += eng;
n += 2;
}
} else {
n = 0;
for (i = 0; i < nfft; i++) {
energy +=
s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
n += 2;
}
}
}
// scale by 1/total-grid-pts to get rho(k)
// multiply by Green's function to get V(k)
n = 0;
for (i = 0; i < nfft; i++) {
work1[n++] *= scaleinv * greensfn[i];
work1[n++] *= scaleinv * greensfn[i];
}
// compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
// FFT leaves data in 3d brick decomposition
// copy it into inner portion of vdx,vdy,vdz arrays
// x direction gradient
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++)
for (j = nylo_fft; j <= nyhi_fft; j++)
for (i = nxlo_fft; i <= nxhi_fft; i++) {
work2[n] = fkx[i]*work1[n+1];
work2[n+1] = -fkx[i]*work1[n];
n += 2;
}
fft2->compute(work2,work2,-1);
n = 0;
int x_hi = nxhi_in * 4 + 3;
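// vd_brick stores 4 interleaved values per grid point; the x, y, and z
// gradients are written to offsets 0, 1, and 2 of each grid point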
for (k = nzlo_in; k <= nzhi_in; k++)
for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in * 4; i < x_hi; i+=4) {
vd_brick[k][j][i] = work2[n];
n += 2;
}
// y direction gradient
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++)
for (j = nylo_fft; j <= nyhi_fft; j++)
for (i = nxlo_fft; i <= nxhi_fft; i++) {
work2[n] = fky[j]*work1[n+1];
work2[n+1] = -fky[j]*work1[n];
n += 2;
}
fft2->compute(work2,work2,-1);
n = 0;
for (k = nzlo_in; k <= nzhi_in; k++)
for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in * 4 + 1; i < x_hi; i+=4) {
vd_brick[k][j][i] = work2[n];
n += 2;
}
// z direction gradient
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++)
for (j = nylo_fft; j <= nyhi_fft; j++)
for (i = nxlo_fft; i <= nxhi_fft; i++) {
work2[n] = fkz[k]*work1[n+1];
work2[n+1] = -fkz[k]*work1[n];
n += 2;
}
fft2->compute(work2,work2,-1);
n = 0;
for (k = nzlo_in; k <= nzhi_in; k++)
for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in * 4 + 2; i < x_hi; i+=4) {
vd_brick[k][j][i] = work2[n];
n += 2;
}
}
/* ----------------------------------------------------------------------
Create array using offsets from pinned memory allocation
------------------------------------------------------------------------- */
FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi,
int n3lo, int n3hi, const char *name,
FFT_SCALAR *data, int vec_length)
{
int i,j;
int n1 = n1hi - n1lo + 1;
int n2 = n2hi - n2lo + 1;
int n3 = n3hi - n3lo + 1;
FFT_SCALAR **plane = (FFT_SCALAR **)memory->smalloc(n1*n2*sizeof(FFT_SCALAR *),name);
FFT_SCALAR ***array = (FFT_SCALAR ***)memory->smalloc(n1*sizeof(FFT_SCALAR **),name);
int n = 0;
for (i = 0; i < n1; i++) {
array[i] = &plane[i*n2];
for (j = 0; j < n2; j++) {
plane[i*n2+j] = &data[n];
n += n3*vec_length;
}
}
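// shift the pointer levels so each dimension is indexed from its lo bound
// (the innermost offset is scaled by vec_length to match the interleaving)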
for (i = 0; i < n1*n2; i++) array[0][i] -= n3lo*vec_length;
for (i = 0; i < n1; i++) array[i] -= n2lo;
return array-n1lo;
}
/* ----------------------------------------------------------------------
free the offset arrays created by create_3d_offset()
(the underlying data buffer is not freed here)
------------------------------------------------------------------------- */
void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,
int n2_offset)
{
if (array == NULL) return;
memory->sfree(&array[n1_offset][n2_offset]);
memory->sfree(array + n1_offset);
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double PPPMGPU::memory_usage()
{
double bytes = nmax*3 * sizeof(double);
int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
bytes += 4 * nbrick * sizeof(FFT_SCALAR);
bytes += 6 * nfft_both * sizeof(double);
bytes += nfft_both * sizeof(double);
bytes += nfft_both*5 * sizeof(FFT_SCALAR);
bytes += 2 * nbuf * sizeof(double);
return bytes + PPPM_GPU_API(bytes)();
}