Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F85842360
pppm_gpu.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Oct 2, 11:49
Size
22 KB
Mime Type
text/x-c
Expires
Fri, Oct 4, 11:49 (2 d)
Engine
blob
Format
Raw Data
Handle
21268208
Attached To
rLAMMPS lammps
pppm_gpu.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), Axel Kohlmeyer (Temple)
------------------------------------------------------------------------- */
#include "lmptype.h"
#include "mpi.h"
#include "string.h"
#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "pppm_gpu.h"
#include "atom.h"
#include "comm.h"
#include "commgrid.h"
#include "neighbor.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
#include "angle.h"
#include "domain.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "gpu_extra.h"
#include "math_const.h"
#include "memory.h"
#include "error.h"
#include "update.h"
#include "universe.h"
#include "fix.h"
using
namespace
LAMMPS_NS
;
using
namespace
MathConst
;
#define MAXORDER 7
#define OFFSET 16384
#define SMALL 0.00001
#define LARGE 10000.0
#define EPS_HOC 1.0e-7
enum
{
REVERSE_RHO
};
enum
{
FORWARD_IK
,
FORWARD_AD
,
FORWARD_IK_PERATOM
,
FORWARD_AD_PERATOM
};
#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF 1.0f
#else
#define ZEROF 0.0
#define ONEF 1.0
#endif
// external functions from cuda library for atom decomposition
#ifdef FFT_SINGLE
#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f
#else
#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _d
#endif
// prototypes for the precision-specific entry points of the external GPU
// library (name-mangled to _f or _d by the PPPM_GPU_API macro above)

// initialize GPU-side PPPM data; returns a pinned host buffer for the
// density brick and passes back a second buffer for the field brick
// via _vd_brick; success is set nonzero on failure
FFT_SCALAR *PPPM_GPU_API(init)(const int nlocal, const int nall, FILE *screen,
                               const int order, const int nxlo_out,
                               const int nylo_out, const int nzlo_out,
                               const int nxhi_out, const int nyhi_out,
                               const int nzhi_out, FFT_SCALAR **rho_coeff,
                               FFT_SCALAR **_vd_brick,
                               const double slab_volfactor,
                               const int nx_pppm, const int ny_pppm,
                               const int nz_pppm, const bool split,
                               const bool respa, int &success);

// release GPU-side PPPM data; poisson_time is passed for timing output
void PPPM_GPU_API(clear)(const double poisson_time);

// charge assignment (spreading) on the GPU; returns nonzero if atoms
// were out of range, sets success false on allocation failure
int PPPM_GPU_API(spread)(const int ago, const int nlocal, const int nall,
                         double **host_x, int *host_type, bool &success,
                         double *host_q, double *boxlo, const double delxinv,
                         const double delyinv, const double delzinv);

// field interpolation (force computation) on the GPU
void PPPM_GPU_API(interp)(const FFT_SCALAR qqrd2e_scale);

// device memory usage in bytes
double PPPM_GPU_API(bytes)();

// copy GPU-computed forces into the host force array f
void PPPM_GPU_API(forces)(double **f);
/* ---------------------------------------------------------------------- */
// constructor: forward args to base PPPM, then set GPU-specific state
PPPMGPU::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
{
  if (narg != 1) error->all(FLERR,"Illegal kspace_style pppm/gpu command");

  // triclinic boxes are not supported by the GPU implementation
  triclinic_support = 0;

  // GPU-managed bricks are allocated lazily in init()
  density_brick_gpu = vd_brick = NULL;

  // flags for the verlet/split integrator; set for real in init()
  kspace_split = false;
  im_real_space = false;

  // error out early if no usable GPU/accelerator is configured
  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}
/* ----------------------------------------------------------------------
free all memory
------------------------------------------------------------------------- */
// destructor: release all GPU-side data; poisson_time is reported by the lib
PPPMGPU::~PPPMGPU()
{
  PPPM_GPU_API(clear)(poisson_time);
}
/* ----------------------------------------------------------------------
called once before run
------------------------------------------------------------------------- */
// called once before each run: set up host-side PPPM, then (re)initialize
// the GPU library and wrap its pinned buffers in 3d-offset brick arrays
void PPPMGPU::init()
{
  // PPPM init manages all arrays except density_brick_gpu and vd_brick
  // thru its deallocate(), allocate()
  // NOTE: could free density_brick and vdxyz_brick after PPPM allocates them,
  //       before allocating db_gpu and vd_brick down below, if don't need,
  //       if do this, make sure to set them to NULL

  // free previous-run bricks first: their extents (nzlo_out etc.) may
  // change when PPPM::init() recomputes the grid decomposition
  destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
  destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
  density_brick_gpu = vd_brick = NULL;

  PPPM::init();

  // insure no conflict with fix balance

  for (int i = 0; i < modify->nfix; i++)
    if (strcmp(modify->fix[i]->style,"balance") == 0)
      error->all(FLERR,"Cannot currently use pppm/gpu with fix balance.");

  // unsupported option

  if (differentiation_flag == 1)
    error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu");

  // with run_style verlet/split the kspace work runs on its own partition
  if (strcmp(update->integrate_style,"verlet/split") == 0) {
    kspace_split = true;
    old_nlocal = 0;
  }

  // on the real-space partition (world 0) this instance does no kspace work
  if (kspace_split && universe->iworld == 0) {
    im_real_space = true;
    return;
  }

  // GPU precision specific init

  bool respa_value = false;
  if (strstr(update->integrate_style,"respa")) respa_value = true;

  if (order > 8)
    error->all(FLERR,"Cannot use order greater than 8 with pppm/gpu.");

  // drop any stale GPU state before re-initializing
  PPPM_GPU_API(clear)(poisson_time);

  int success;
  FFT_SCALAR *data, *h_brick;
  // h_brick: pinned host buffer for the density brick (1 value per point)
  // data:    pinned host buffer for the field brick (4 values per point)
  h_brick = PPPM_GPU_API(init)(atom->nlocal, atom->nlocal + atom->nghost,
                               screen, order, nxlo_out, nylo_out, nzlo_out,
                               nxhi_out, nyhi_out, nzhi_out, rho_coeff,
                               &data, slab_volfactor, nx_pppm, ny_pppm,
                               nz_pppm, kspace_split, respa_value, success);

  GPU_EXTRA::check_flag(success,error,world);

  // wrap the pinned buffers in [z][y][x]-indexed offset arrays so the
  // comm/FFT code can address them like ordinary bricks
  density_brick_gpu =
    create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
                     nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1);
  vd_brick =
    create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
                     nxlo_out,nxhi_out,"pppm:vd_brick",data,4);

  poisson_time = 0.0;
}
/* ----------------------------------------------------------------------
compute the PPPMGPU long-range force, energy, virial
------------------------------------------------------------------------- */
void
PPPMGPU
::
compute
(
int
eflag
,
int
vflag
)
{
int
i
,
j
;
int
nago
;
if
(
kspace_split
)
{
if
(
im_real_space
)
return
;
if
(
atom
->
nlocal
>
old_nlocal
)
{
nago
=
0
;
old_nlocal
=
atom
->
nlocal
;
}
else
nago
=
1
;
}
else
nago
=
neighbor
->
ago
;
// set energy/virial flags
// invoke allocate_peratom() if needed for first time
if
(
eflag
||
vflag
)
ev_setup
(
eflag
,
vflag
);
else
evflag
=
evflag_atom
=
eflag_global
=
vflag_global
=
eflag_atom
=
vflag_atom
=
0
;
// If need per-atom energies/virials, also do particle map on host
// concurrently with GPU calculations
if
(
evflag_atom
&&
!
peratom_allocate_flag
)
{
allocate_peratom
();
cg_peratom
->
ghost_notify
();
cg_peratom
->
setup
();
peratom_allocate_flag
=
1
;
}
bool
success
=
true
;
int
flag
=
PPPM_GPU_API
(
spread
)(
nago
,
atom
->
nlocal
,
atom
->
nlocal
+
atom
->
nghost
,
atom
->
x
,
atom
->
type
,
success
,
atom
->
q
,
domain
->
boxlo
,
delxinv
,
delyinv
,
delzinv
);
if
(
!
success
)
error
->
one
(
FLERR
,
"Insufficient memory on accelerator"
);
if
(
flag
!=
0
)
error
->
one
(
FLERR
,
"Out of range atoms - cannot compute PPPM"
);
// convert atoms from box to lamda coords
if
(
triclinic
==
0
)
boxlo
=
domain
->
boxlo
;
else
{
boxlo
=
domain
->
boxlo_lamda
;
domain
->
x2lamda
(
atom
->
nlocal
);
}
// extend size of per-atom arrays if necessary
if
(
evflag_atom
&&
atom
->
nlocal
>
nmax
)
{
memory
->
destroy
(
part2grid
);
nmax
=
atom
->
nmax
;
memory
->
create
(
part2grid
,
nmax
,
3
,
"pppm:part2grid"
);
particle_map
();
}
double
t3
=
MPI_Wtime
();
// all procs communicate density values from their ghost cells
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg
->
reverse_comm
(
this
,
REVERSE_RHO
);
brick2fft
();
// compute potential gradient on my FFT grid and
// portion of e_long on this proc's FFT grid
// return gradients (electric fields) in 3d brick decomposition
poisson
();
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if
(
differentiation_flag
==
1
)
cg
->
forward_comm
(
this
,
FORWARD_AD
);
else
cg
->
forward_comm
(
this
,
FORWARD_IK
);
// extra per-atom energy/virial communication
if
(
evflag_atom
)
{
if
(
differentiation_flag
==
1
&&
vflag_atom
)
cg_peratom
->
forward_comm
(
this
,
FORWARD_AD_PERATOM
);
else
if
(
differentiation_flag
==
0
)
cg_peratom
->
forward_comm
(
this
,
FORWARD_IK_PERATOM
);
}
poisson_time
+=
MPI_Wtime
()
-
t3
;
// calculate the force on my particles
FFT_SCALAR
qscale
=
force
->
qqrd2e
*
scale
;
PPPM_GPU_API
(
interp
)(
qscale
);
// per-atom energy/virial
// energy includes self-energy correction
if
(
evflag_atom
)
fieldforce_peratom
();
// sum energy across procs and add in volume-dependent term
// reset qsum and qsqsum if atom count has changed
if
(
eflag_global
)
{
double
energy_all
;
MPI_Allreduce
(
&
energy
,
&
energy_all
,
1
,
MPI_DOUBLE
,
MPI_SUM
,
world
);
energy
=
energy_all
;
if
(
atom
->
natoms
!=
natoms_original
)
{
qsum_qsq
(
0
);
natoms_original
=
atom
->
natoms
;
}
energy
*=
0.5
*
volume
;
energy
-=
g_ewald
*
qsqsum
/
1.772453851
+
MY_PI2
*
qsum
*
qsum
/
(
g_ewald
*
g_ewald
*
volume
);
energy
*=
qscale
;
}
// sum virial across procs
if
(
vflag_global
)
{
double
virial_all
[
6
];
MPI_Allreduce
(
virial
,
virial_all
,
6
,
MPI_DOUBLE
,
MPI_SUM
,
world
);
for
(
i
=
0
;
i
<
6
;
i
++
)
virial
[
i
]
=
0.5
*
qscale
*
volume
*
virial_all
[
i
];
}
// per-atom energy/virial
// energy includes self-energy correction
if
(
evflag_atom
)
{
double
*
q
=
atom
->
q
;
int
nlocal
=
atom
->
nlocal
;
if
(
eflag_atom
)
{
for
(
i
=
0
;
i
<
nlocal
;
i
++
)
{
eatom
[
i
]
*=
0.5
;
eatom
[
i
]
-=
g_ewald
*
q
[
i
]
*
q
[
i
]
/
MY_PIS
+
MY_PI2
*
q
[
i
]
*
qsum
/
(
g_ewald
*
g_ewald
*
volume
);
eatom
[
i
]
*=
qscale
;
}
}
if
(
vflag_atom
)
{
for
(
i
=
0
;
i
<
nlocal
;
i
++
)
for
(
j
=
0
;
j
<
6
;
j
++
)
vatom
[
i
][
j
]
*=
0.5
*
qscale
;
}
}
// 2d slab correction
if
(
slabflag
)
slabcorr
();
// convert atoms back from lamda to box coords
if
(
triclinic
)
domain
->
lamda2x
(
atom
->
nlocal
);
if
(
kspace_split
)
PPPM_GPU_API
(
forces
)(
atom
->
f
);
}
/* ----------------------------------------------------------------------
remap density from 3d brick decomposition to FFT decomposition
------------------------------------------------------------------------- */
// copy the owned (non-ghost) portion of density_brick_gpu into the linear
// density_fft array, then remap from 3d brick to FFT decomposition
void PPPMGPU::brick2fft()
{
  // copy grabs inner portion of density from 3d brick
  // remap could be done as pre-stage of FFT,
  //   but this works optimally on only double values, not complex values

  int idx = 0;
  for (int kz = nzlo_in; kz <= nzhi_in; kz++)
    for (int ky = nylo_in; ky <= nyhi_in; ky++)
      for (int kx = nxlo_in; kx <= nxhi_in; kx++)
        density_fft[idx++] = density_brick_gpu[kz][ky][kx];

  remap->perform(density_fft,density_fft,work1);
}
/* ----------------------------------------------------------------------
FFT-based Poisson solver
------------------------------------------------------------------------- */
// FFT-based Poisson solver (ik differentiation)
// transforms the charge density to k-space, optionally accumulates
// energy/virial, multiplies by the Green's function, and back-transforms
// the three field gradients into the interleaved vd_brick array
void PPPMGPU::poisson_ik()
{
  int i,j,k,n;
  double eng;

  // transform charge density (r -> k)
  // work1 holds interleaved complex values: [re0, im0, re1, im1, ...]

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n++] = density_fft[i];
    work1[n++] = ZEROF;
  }

  fft1->compute(work1,work1,1);

  // if requested, compute energy and virial contribution

  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
  double s2 = scaleinv*scaleinv;

  if (eflag_global || vflag_global) {
    if (vflag_global) {
      n = 0;
      for (i = 0; i < nfft; i++) {
        // eng = |rho(k)|^2 * G(k), scaled by 1/N^2 for the unnormalized FFT
        eng = s2*greensfn[i]*(work1[n]*work1[n] + work1[n+1]*work1[n+1]);
        for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
        if (eflag_global) energy += eng;
        n += 2;
      }
    } else {
      n = 0;
      for (i = 0; i < nfft; i++) {
        energy +=
          s2*greensfn[i]*(work1[n]*work1[n] + work1[n+1]*work1[n+1]);
        n += 2;
      }
    }
  }

  // scale by 1/total-grid-pts to get rho(k)
  // multiply by Green's function to get V(k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n++] *= scaleinv*greensfn[i];
    work1[n++] *= scaleinv*greensfn[i];
  }

  // extra FFTs for per-atom energy/virial

  if (evflag_atom) poisson_peratom();

  // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
  // FFT leaves data in 3d brick decomposition
  // copy it into inner portion of vdx,vdy,vdz arrays

  // x direction gradient: work2 = -i*kx*V(k) (complex multiply by -i*kx)

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fkx[i]*work1[n+1];
        work2[n+1] = -fkx[i]*work1[n];
        n += 2;
      }

  fft2->compute(work2,work2,-1);

  // vd_brick interleaves 4 values per grid point; the x component lives
  // at stride-4 offsets 0, hence the i += 4 walk up to x_hi
  n = 0;
  int x_hi = nxhi_in*4 + 3;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in*4; i < x_hi; i += 4) {
        vd_brick[k][j][i] = work2[n];
        n += 2;   // only the real part of the inverse FFT is kept
      }

  // y direction gradient

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fky[j]*work1[n+1];
        work2[n+1] = -fky[j]*work1[n];
        n += 2;
      }

  fft2->compute(work2,work2,-1);

  // y component lives at stride-4 offset 1
  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in*4 + 1; i < x_hi; i += 4) {
        vd_brick[k][j][i] = work2[n];
        n += 2;
      }

  // z direction gradient

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fkz[k]*work1[n+1];
        work2[n+1] = -fkz[k]*work1[n];
        n += 2;
      }

  fft2->compute(work2,work2,-1);

  // z component lives at stride-4 offset 2 (offset 3 is unused padding
  // as far as this routine is concerned)
  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in*4 + 2; i < x_hi; i += 4) {
        vd_brick[k][j][i] = work2[n];
        n += 2;
      }
}
/* ----------------------------------------------------------------------
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
// pack own grid values into buf to send to another proc
// flag selects which brick(s) are packed; list holds flat offsets into
// the brick relative to its (nzlo_out, nylo_out, nxlo_out) corner
void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
  int n = 0;

  if (flag == FORWARD_IK) {
    // vd_brick stores 4 interleaved values per grid point; pack the
    // 3 field components (x,y,z) for each listed point
    int offset;
    FFT_SCALAR *src = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
    for (int i = 0; i < nlist; i++) {
      offset = 4*list[i];
      buf[n++] = src[offset++];
      buf[n++] = src[offset++];
      buf[n++] = src[offset];
    }
  } else if (flag == FORWARD_AD) {
    // one potential value per point
    FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++)
      buf[i] = src[list[i]];
  } else if (flag == FORWARD_IK_PERATOM) {
    // per-atom energy (u_brick) and/or 6 virial components per point;
    // unpack_forward must read in exactly this order
    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      if (eflag_atom) buf[n++] = esrc[list[i]];
      if (vflag_atom) {
        buf[n++] = v0src[list[i]];
        buf[n++] = v1src[list[i]];
        buf[n++] = v2src[list[i]];
        buf[n++] = v3src[list[i]];
        buf[n++] = v4src[list[i]];
        buf[n++] = v5src[list[i]];
      }
    }
  } else if (flag == FORWARD_AD_PERATOM) {
    // 6 per-atom virial components per point
    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      buf[n++] = v0src[list[i]];
      buf[n++] = v1src[list[i]];
      buf[n++] = v2src[list[i]];
      buf[n++] = v3src[list[i]];
      buf[n++] = v4src[list[i]];
      buf[n++] = v5src[list[i]];
    }
  }
}
/* ----------------------------------------------------------------------
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
// unpack another proc's own values from buf and set own ghost values
// mirror of pack_forward: must consume buf in exactly the packing order
void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
  int n = 0;

  if (flag == FORWARD_IK) {
    // 3 field components per point into the 4-wide interleaved vd_brick
    int offset;
    FFT_SCALAR *dest = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
    for (int i = 0; i < nlist; i++) {
      offset = 4*list[i];
      dest[offset++] = buf[n++];
      dest[offset++] = buf[n++];
      dest[offset] = buf[n++];
    }
  } else if (flag == FORWARD_AD) {
    FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++)
      dest[list[i]] = buf[i];
  } else if (flag == FORWARD_IK_PERATOM) {
    // NOTE: pointers are named *src for symmetry with pack_forward,
    // but here they are the destinations being written
    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      if (eflag_atom) esrc[list[i]] = buf[n++];
      if (vflag_atom) {
        v0src[list[i]] = buf[n++];
        v1src[list[i]] = buf[n++];
        v2src[list[i]] = buf[n++];
        v3src[list[i]] = buf[n++];
        v4src[list[i]] = buf[n++];
        v5src[list[i]] = buf[n++];
      }
    }
  } else if (flag == FORWARD_AD_PERATOM) {
    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      v0src[list[i]] = buf[n++];
      v1src[list[i]] = buf[n++];
      v2src[list[i]] = buf[n++];
      v3src[list[i]] = buf[n++];
      v4src[list[i]] = buf[n++];
      v5src[list[i]] = buf[n++];
    }
  }
}
/* ----------------------------------------------------------------------
pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */
void
PPPMGPU
::
pack_reverse
(
int
flag
,
FFT_SCALAR
*
buf
,
int
nlist
,
int
*
list
)
{
if
(
flag
==
REVERSE_RHO
)
{
FFT_SCALAR
*
src
=
&
density_brick_gpu
[
nzlo_out
][
nylo_out
][
nxlo_out
];
for
(
int
i
=
0
;
i
<
nlist
;
i
++
)
buf
[
i
]
=
src
[
list
[
i
]];
}
}
/* ----------------------------------------------------------------------
unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */
void
PPPMGPU
::
unpack_reverse
(
int
flag
,
FFT_SCALAR
*
buf
,
int
nlist
,
int
*
list
)
{
if
(
flag
==
REVERSE_RHO
)
{
FFT_SCALAR
*
dest
=
&
density_brick_gpu
[
nzlo_out
][
nylo_out
][
nxlo_out
];
for
(
int
i
=
0
;
i
<
nlist
;
i
++
)
dest
[
list
[
i
]]
+=
buf
[
i
];
}
}
/* ----------------------------------------------------------------------
create array using offsets from pinned memory allocation
------------------------------------------------------------------------- */
// build a 3d array [n1lo..n1hi][n2lo..n2hi][n3lo..n3hi] of row pointers
// over an existing (pinned) linear allocation `data`, with vec_length
// values per innermost grid point; only the pointer tables are allocated
// here — the data itself belongs to the GPU library
FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo,
                                        int n2hi, int n3lo, int n3hi,
                                        const char *name, FFT_SCALAR *data,
                                        int vec_length)
{
  int i,j;
  int n1 = n1hi - n1lo + 1;
  int n2 = n2hi - n2lo + 1;
  int n3 = n3hi - n3lo + 1;

  // plane: n1*n2 row pointers; array: n1 plane pointers
  FFT_SCALAR **plane = (FFT_SCALAR **)
    memory->smalloc(n1*n2*sizeof(FFT_SCALAR *),name);
  FFT_SCALAR ***array = (FFT_SCALAR ***)
    memory->smalloc(n1*sizeof(FFT_SCALAR **),name);

  // point each row at its slice of the linear buffer
  int n = 0;
  for (i = 0; i < n1; i++) {
    array[i] = &plane[i*n2];
    for (j = 0; j < n2; j++) {
      plane[i*n2+j] = &data[n];
      n += n3*vec_length;
    }
  }

  // shift pointers so the array can be indexed with the lo offsets,
  // i.e. array[n1lo][n2lo][n3lo*vec_length] addresses data[0]
  for (i = 0; i < n1*n2; i++) array[0][i] -= n3lo*vec_length;
  for (i = 0; i < n1; i++) array[i] -= n2lo;
  return array - n1lo;
}
/* ----------------------------------------------------------------------
3d memory offsets
------------------------------------------------------------------------- */
// free the pointer tables built by create_3d_offset; the offsets undo the
// index shifts applied there — the underlying data buffer is NOT freed
// here (it is owned by the GPU library)
void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,
                                int n2_offset)
{
  if (array == NULL) return;
  memory->sfree(&array[n1_offset][n2_offset]);   // the plane table
  memory->sfree(array + n1_offset);              // the top-level table
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double
PPPMGPU
::
memory_usage
()
{
double
bytes
=
PPPM
::
memory_usage
();
// NOTE: add tallying here for density_brick_gpu and vd_brick
// could subtract out density_brick and vdxyz_brick if freed them above
// it the net efffect is zero, do nothing
return
bytes
+
PPPM_GPU_API
(
bytes
)();
}
/* ----------------------------------------------------------------------
perform and time the 1d FFTs required for N timesteps
------------------------------------------------------------------------- */
// perform and time the 1d FFTs required for N timesteps
// returns the number of 1d FFTs per timestep (4)
int PPPMGPU::timing_1d(int n, double &time1d)
{
  // a purely real-space partition does no FFTs; report a dummy time
  if (im_real_space) time1d = 1.0;
  else PPPM::timing_1d(n,time1d);
  return 4;
}
/* ----------------------------------------------------------------------
perform and time the 3d FFTs required for N timesteps
------------------------------------------------------------------------- */
// perform and time the 3d FFTs required for N timesteps
// returns the number of 3d FFTs per timestep (4)
int PPPMGPU::timing_3d(int n, double &time3d)
{
  // a purely real-space partition does no FFTs; report a dummy time
  if (im_real_space) time3d = 1.0;
  else PPPM::timing_3d(n,time3d);
  return 4;
}
/* ----------------------------------------------------------------------
adjust PPPM coeffs, called initially and whenever volume has changed
------------------------------------------------------------------------- */
// adjust PPPM coeffs, called initially and whenever volume has changed
void PPPMGPU::setup()
{
  // a purely real-space partition has no kspace coeffs to adjust
  if (!im_real_space) PPPM::setup();
}
Event Timeline
Log In to Comment