Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F64588710
pppm_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 28, 00:01
Size
40 KB
Mime Type
text/x-c
Expires
Thu, May 30, 00:01 (2 d)
Engine
blob
Format
Raw Data
Handle
17928697
Attached To
rLAMMPS lammps
pppm_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: William McDoniel (RWTH Aachen University)
Rodrigo Canales (RWTH Aachen University)
Markus Hoehnerbach (RWTH Aachen University)
W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <mpi.h>
#include <stdlib.h>
#include <math.h>
#include "pppm_intel.h"
#include "atom.h"
#include "error.h"
#include "fft3d_wrap.h"
#include "gridcomm.h"
#include "math_const.h"
#include "math_special.h"
#include "memory.h"
#include "suffix.h"
using
namespace
LAMMPS_NS
;
using
namespace
MathConst
;
using
namespace
MathSpecial
;
#define MAXORDER 7
#define OFFSET 16384
#define LARGE 10000.0
#define SMALL 0.00001
#define EPS_HOC 1.0e-7
enum
{
REVERSE_RHO
};
enum
{
FORWARD_IK
,
FORWARD_AD
,
FORWARD_IK_PERATOM
,
FORWARD_AD_PERATOM
};
#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF 1.0f
#else
#define ZEROF 0.0
#define ONEF 1.0
#endif
/* ---------------------------------------------------------------------- */
PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
{
  // mark this style as the Intel-suffixed variant
  suffix_flag |= Suffix::INTEL;

  // default stencil size of 7 (larger than base PPPM default)
  order = 7;

  // no scratch buffers or lookup tables allocated yet;
  // they are created lazily in init() based on package settings
  perthread_density = NULL;

  particle_ekx = NULL;
  particle_eky = NULL;
  particle_ekz = NULL;

  rho_lookup = NULL;
  drho_lookup = NULL;
  rho_points = 0;

  vdxy_brick = NULL;
  vdz0_brick = NULL;
  work3 = NULL;
  cg_pack = NULL;

  // feature flags, all off until init() decides otherwise
  _use_table = 0;
  _use_packing = 0;
  _use_lrt = 0;
}
PPPMIntel::~PPPMIntel()
{
  // release everything this derived class allocated, roughly in
  // reverse order of creation; each deallocation is independent
  delete cg_pack;

  memory->destroy(work3);

  memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
  memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);

  memory->destroy(drho_lookup);
  memory->destroy(rho_lookup);

  memory->destroy(particle_ekz);
  memory->destroy(particle_eky);
  memory->destroy(particle_ekx);

  memory->destroy(perthread_density);
}
/* ----------------------------------------------------------------------
called once before run
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   called once before run: run base-class setup, locate the required
   Intel package fix, then allocate the Intel-specific scratch buffers
   (per-thread density arrays, rho lookup tables, packed field bricks)
------------------------------------------------------------------------- */

void PPPMIntel::init()
{
  PPPM::init();

  // the Intel styles cannot run without the "package intel" fix
  int ifix = modify->find_fix("package_intel");
  if (ifix < 0)
    error->all(FLERR,
               "The 'package intel' command is required for /intel styles");
  fix = static_cast<FixIntel *>(modify->fix[ifix]);

  #ifdef _LMP_INTEL_OFFLOAD
  // when offload balancing is active, fall back to the base PPPM path
  _use_base = 0;
  if (fix->offload_balance() != 0.0) {
    _use_base = 1;
    return;
  }
  #endif

  fix->kspace_init_check();

  _use_lrt = fix->lrt();

  // For vectorization, we need some padding in the end
  // The first thread computes on the global density
  if ((comm->nthreads > 1) && !_use_lrt) {
    memory->destroy(perthread_density);
    memory->create(perthread_density, comm->nthreads - 1,
                   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
                   "pppmintel:perthread_density");
  }

  // optionally precompute the stencil weights into a lookup table
  _use_table = fix->pppm_table();
  if (_use_table) {
    rho_points = 5000;
    memory->destroy(rho_lookup);
    memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
                   "pppmintel:rho_lookup");
    if (differentiation_flag == 1) {
      memory->destroy(drho_lookup);
      memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
                     "pppmintel:drho_lookup");
    }
    precompute_rho();
  }

  if (order > INTEL_P3M_MAXORDER)
    error->all(FLERR, "PPPM order greater than supported by USER-INTEL\n");

  // packing mode interleaves x/y (and z/zero) field components so that a
  // full order-7 stencil fills one 16-wide single-precision vector
  _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
                 && (sizeof(FFT_SCALAR) == sizeof(float))
                 && (differentiation_flag == 0);
  if (_use_packing) {
    // replace the separate vdx/vdy/vdz bricks with interleaved
    // double-width vdxy/vdz0 bricks
    memory->destroy3d_offset(vdx_brick, nzlo_out, nylo_out, nxlo_out);
    memory->destroy3d_offset(vdy_brick, nzlo_out, nylo_out, nxlo_out);
    memory->destroy3d_offset(vdz_brick, nzlo_out, nylo_out, nxlo_out);
    memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
    create3d_offset(vdxy_brick, nzlo_out, nzhi_out + 2, nylo_out, nyhi_out,
                    2*nxlo_out, 2*nxhi_out + 1, "pppmintel:vdxy_brick");
    memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
    create3d_offset(vdz0_brick, nzlo_out, nzhi_out + 2, nylo_out, nyhi_out,
                    2*nxlo_out, 2*nxhi_out + 1, "pppmintel:vdz0_brick");
    memory->destroy(work3);
    memory->create(work3, 2*nfft_both, "pppmintel:work3");

    // new communicator for the double-size bricks
    delete cg_pack;
    int (*procneigh)[2] = comm->procneigh;
    cg_pack = new GridComm(lmp, world, 2, 0,
                           2*nxlo_in, 2*nxhi_in + 1, nylo_in, nyhi_in,
                           nzlo_in, nzhi_in,
                           2*nxlo_out, 2*nxhi_out + 1, nylo_out, nyhi_out,
                           nzlo_out, nzhi_out,
                           procneigh[0][0], procneigh[0][1],
                           procneigh[1][0], procneigh[1][1],
                           procneigh[2][0], procneigh[2][1]);
    cg_pack->ghost_notify();
    cg_pack->setup();
  }
}
/* ----------------------------------------------------------------------
compute the PPPMIntel long-range force, energy, virial
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   compute the PPPMIntel long-range force, energy, virial
   dispatches to the base-class implementation when offloading is active,
   otherwise runs the two-phase Intel path
------------------------------------------------------------------------- */

void PPPMIntel::compute(int eflag, int vflag)
{
  #ifdef _LMP_INTEL_OFFLOAD
  // offload balancing active: use the stock PPPM implementation
  if (_use_base) {
    PPPM::compute(eflag, vflag);
    return;
  }
  #endif

  // phase 1: charge spreading + Poisson solve; phase 2: force interpolation
  compute_first(eflag, vflag);
  compute_second(eflag, vflag);
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   first half of the long-range computation: set up energy/virial flags,
   refresh charge sums and per-atom arrays, spread charges to the grid,
   solve Poisson's equation in k-space, and communicate the resulting
   E-field (or potential) bricks
------------------------------------------------------------------------- */

void PPPMIntel::compute_first(int eflag, int vflag)
{
  // set energy/virial flags
  // invoke allocate_peratom() if needed for first time

  if (eflag || vflag) ev_setup(eflag, vflag);
  else evflag = evflag_atom = eflag_global = vflag_global =
         eflag_atom = vflag_atom = 0;

  if (evflag_atom && !peratom_allocate_flag) {
    allocate_peratom();
    cg_peratom->ghost_notify();
    cg_peratom->setup();
  }

  // if atom count has changed, update qsum and qsqsum

  if (atom->natoms != natoms_original) {
    qsum_qsq();
    natoms_original = atom->natoms;
  }

  // return if there are no charges

  if (qsqsum == 0.0) return;

  // convert atoms from box to lamda coords

  if (triclinic == 0) boxlo = domain->boxlo;
  else {
    boxlo = domain->boxlo_lamda;
    domain->x2lamda(atom->nlocal);
  }

  // extend size of per-atom arrays if necessary

  if (atom->nmax > nmax) {
    memory->destroy(part2grid);
    if (differentiation_flag == 1) {
      memory->destroy(particle_ekx);
      memory->destroy(particle_eky);
      memory->destroy(particle_ekz);
    }
    nmax = atom->nmax;
    memory->create(part2grid, nmax, 3, "pppm:part2grid");
    if (differentiation_flag == 1) {
      memory->create(particle_ekx, nmax, "pppmintel:pekx");
      memory->create(particle_eky, nmax, "pppmintel:peky");
      memory->create(particle_ekz, nmax, "pppmintel:pekz");
    }
  }

  // find grid points for all my particles
  // map my particle charge onto my local 3d density grid
  // instantiate the templates for the precision mode selected by the fix

  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
    particle_map<float,double>(fix->get_mixed_buffers());
    make_rho<float,double>(fix->get_mixed_buffers());
  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
    particle_map<double,double>(fix->get_double_buffers());
    make_rho<double,double>(fix->get_double_buffers());
  } else {
    particle_map<float,float>(fix->get_single_buffers());
    make_rho<float,float>(fix->get_single_buffers());
  }

  // all procs communicate density values from their ghost cells
  //   to fully sum contribution in their 3d bricks
  // remap from 3d decomposition to FFT decomposition

  cg->reverse_comm(this, REVERSE_RHO);
  brick2fft();

  // compute potential gradient on my FFT grid and
  //   portion of e_long on this proc's FFT grid
  // return gradients (electric fields) in 3d brick decomposition
  // also performs per-atom calculations via poisson_peratom()

  if (differentiation_flag == 1) poisson_ad();
  else poisson_ik_intel();

  // all procs communicate E-field values
  // to fill ghost cells surrounding their 3d bricks
  // packing mode uses the double-width communicator built in init()

  if (differentiation_flag == 1) cg->forward_comm(this, FORWARD_AD);
  else {
    if (_use_packing) cg_pack->forward_comm(this, FORWARD_IK);
    else cg->forward_comm(this, FORWARD_IK);
  }

  // extra per-atom energy/virial communication

  if (evflag_atom) {
    if (differentiation_flag == 1 && vflag_atom)
      cg_peratom->forward_comm(this, FORWARD_AD_PERATOM);
    else if (differentiation_flag == 0)
      cg_peratom->forward_comm(this, FORWARD_IK_PERATOM);
  }
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   second half of the long-range computation: interpolate forces back to
   particles, reduce global energy/virial over MPI, apply self-energy and
   slab corrections, and restore triclinic coordinates
------------------------------------------------------------------------- */

void PPPMIntel::compute_second(int eflag, int vflag)
{
  int i, j;

  // calculate the force on my particles
  // template instantiation follows the fix's precision mode

  if (differentiation_flag == 1) {
    if (fix->precision() == FixIntel::PREC_MODE_MIXED)
      fieldforce_ad<float,double>(fix->get_mixed_buffers());
    else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
      fieldforce_ad<double,double>(fix->get_double_buffers());
    else
      fieldforce_ad<float,float>(fix->get_single_buffers());
  } else {
    if (fix->precision() == FixIntel::PREC_MODE_MIXED)
      fieldforce_ik<float,double>(fix->get_mixed_buffers());
    else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
      fieldforce_ik<double,double>(fix->get_double_buffers());
    else
      fieldforce_ik<float,float>(fix->get_single_buffers());
  }

  // extra per-atom energy/virial communication

  if (evflag_atom) fieldforce_peratom();

  // sum global energy across procs and add in volume-dependent term

  const double qscale = qqrd2e * scale;

  if (eflag_global) {
    double energy_all;
    MPI_Allreduce(&energy, &energy_all, 1, MPI_DOUBLE, MPI_SUM, world);
    energy = energy_all;

    energy *= 0.5 * volume;
    // subtract self-energy and neutralizing-background terms
    energy -= g_ewald*qsqsum/MY_PIS +
      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
    energy *= qscale;
  }

  // sum global virial across procs

  if (vflag_global) {
    double virial_all[6];
    MPI_Allreduce(virial, virial_all, 6, MPI_DOUBLE, MPI_SUM, world);
    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
  }

  // per-atom energy/virial
  // energy includes self-energy correction
  // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms

  if (evflag_atom) {
    double *q = atom->q;
    int nlocal = atom->nlocal;
    int ntotal = nlocal;
    if (tip4pflag) ntotal += atom->nghost;

    if (eflag_atom) {
      for (i = 0; i < nlocal; i++) {
        eatom[i] *= 0.5;
        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
          (g_ewald*g_ewald*volume);
        eatom[i] *= qscale;
      }
      // ghost atoms (TIP4P) get only the 0.5*qscale normalization
      for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale;
    }

    if (vflag_atom) {
      for (i = 0; i < ntotal; i++)
        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
    }
  }

  // 2d slab correction

  if (slabflag == 1) slabcorr();

  // convert atoms back from lamda to box coords

  if (triclinic) domain->lamda2x(atom->nlocal);
}
/* ----------------------------------------------------------------------
find center grid pt for each of my particles
check that full stencil for the particle will fit in my 3d brick
store central grid pt indices in part2grid array
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   find center grid pt for each of my particles
   check that full stencil for the particle will fit in my 3d brick
   store central grid pt indices in part2grid array
------------------------------------------------------------------------- */

template<class flt_t, class acc_t>
void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
{
  ATOM_T * _noalias const x = buffers->get_x(0);
  int nlocal = atom->nlocal;
  int nthr;
  // long-range thread mode (LRT) dedicates a single thread to kspace
  if (_use_lrt)
    nthr = 1;
  else
    nthr = comm->nthreads;

  // flag accumulates (via OpenMP reduction) whether any atom's stencil
  // falls outside this proc's brick
  int flag = 0;

  if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
    error->one(FLERR, "Non-numeric box dimensions - simulation unstable");

  #if defined(_OPENMP)
  #pragma omp parallel default(none) \
    shared(nlocal, nthr) reduction(+:flag) if(!_use_lrt)
  #endif
  {
    // cache geometry in the template's float type for vectorization
    const flt_t lo0 = boxlo[0];
    const flt_t lo1 = boxlo[1];
    const flt_t lo2 = boxlo[2];
    const flt_t xi = delxinv;
    const flt_t yi = delyinv;
    const flt_t zi = delzinv;
    const flt_t fshift = shift;

    int iifrom, iito, tid;
    IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr,
                              sizeof(ATOM_T));

    #if defined(LMP_SIMD_COMPILER)
    #pragma vector aligned
    #pragma simd reduction(+:flag)
    #endif
    for (int i = iifrom; i < iito; i++) {

      // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
      // current particle coord can be outside global and local box
      // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

      int nx = static_cast<int> ((x[i].x - lo0)*xi + fshift) - OFFSET;
      int ny = static_cast<int> ((x[i].y - lo1)*yi + fshift) - OFFSET;
      int nz = static_cast<int> ((x[i].z - lo2)*zi + fshift) - OFFSET;

      part2grid[i][0] = nx;
      part2grid[i][1] = ny;
      part2grid[i][2] = nz;

      // check that entire stencil around nx,ny,nz will fit in my 3d brick

      if (nx + nlower < nxlo_out || nx + nupper > nxhi_out ||
          ny + nlower < nylo_out || ny + nupper > nyhi_out ||
          nz + nlower < nzlo_out || nz + nupper > nzhi_out)
        flag = 1;
    }
  }

  if (flag) error->one(FLERR, "Out of range atoms - cannot compute PPPM");
}
/* ----------------------------------------------------------------------
create discretized "density" on section of global grid due to my particles
density(x,y,z) = charge "density" at grid points of my 3d brick
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
in global grid
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles
   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
   each thread spreads into its own padded density array (thread 0 writes
   the global brick directly); a second parallel pass reduces the
   per-thread copies into the global brick
------------------------------------------------------------------------- */

template<class flt_t, class acc_t, int use_table>
void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
{
  // flat view of this proc's density brick, starting at its low corner
  FFT_SCALAR * _noalias global_density =
    &(density_brick[nzlo_out][nylo_out][nxlo_out]);

  // loop over my charges, add their contribution to nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  ATOM_T * _noalias const x = buffers->get_x(0);
  flt_t * _noalias const q = buffers->get_q(0);
  int nlocal = atom->nlocal;
  int nthr;
  // LRT mode dedicates one thread to kspace work
  if (_use_lrt)
    nthr = 1;
  else
    nthr = comm->nthreads;

  #if defined(_OPENMP)
  #pragma omp parallel default(none) \
    shared(nthr, nlocal, global_density) if(!_use_lrt)
  #endif
  {
    // brick extents (including ghosts) used to linearize 3d indices
    const int nix = nxhi_out - nxlo_out + 1;
    const int niy = nyhi_out - nylo_out + 1;

    const flt_t lo0 = boxlo[0];
    const flt_t lo1 = boxlo[1];
    const flt_t lo2 = boxlo[2];
    const flt_t xi = delxinv;
    const flt_t yi = delyinv;
    const flt_t zi = delzinv;
    const flt_t fshift = shift;
    const flt_t fshiftone = shiftone;
    const flt_t fdelvolinv = delvolinv;

    int ifrom, ito, tid;
    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

    // thread 0 writes the global brick; other threads use private copies
    FFT_SCALAR * _noalias my_density = tid == 0 ?
      global_density : perthread_density[tid - 1];

    // clear 3d density array
    memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));

    for (int i = ifrom; i < ito; i++) {

      int nx = part2grid[i][0];
      int ny = part2grid[i][1];
      int nz = part2grid[i][2];

      // nzsum is the flattened index of the stencil's low corner
      int nysum = nlower + ny - nylo_out;
      int nxsum = nlower + nx - nxlo_out;
      int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum;

      // fractional distance to the "lower left" grid point
      FFT_SCALAR dx = nx + fshiftone - (x[i].x - lo0)*xi;
      FFT_SCALAR dy = ny + fshiftone - (x[i].y - lo1)*yi;
      FFT_SCALAR dz = nz + fshiftone - (x[i].z - lo2)*zi;

      // per-dimension stencil weights, zero-padded to the aligned width
      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      if (use_table) {
        // look the weights up in the precomputed table (see precompute_rho)
        dx = dx*half_rho_scale + half_rho_scale_plus;
        int idx = dx;
        dy = dy*half_rho_scale + half_rho_scale_plus;
        int idy = dy;
        dz = dz*half_rho_scale + half_rho_scale_plus;
        int idz = dz;
        #if defined(LMP_SIMD_COMPILER)
        #pragma simd
        #endif
        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
          rho[0][k] = rho_lookup[idx][k];
          rho[1][k] = rho_lookup[idy][k];
          rho[2][k] = rho_lookup[idz][k];
        }
      } else {
        // evaluate the weight polynomials directly (Horner's rule)
        #if defined(LMP_SIMD_COMPILER)
        #pragma simd
        #endif
        for (int k = nlower; k <= nupper; k++) {
          FFT_SCALAR r1, r2, r3;
          r1 = r2 = r3 = ZEROF;

          for (int l = order - 1; l >= 0; l--) {
            r1 = rho_coeff[l][k] + r1*dx;
            r2 = rho_coeff[l][k] + r2*dy;
            r3 = rho_coeff[l][k] + r3*dz;
          }
          rho[0][k - nlower] = r1;
          rho[1][k - nlower] = r2;
          rho[2][k - nlower] = r3;
        }
      }

      FFT_SCALAR z0 = fdelvolinv * q[i];

      #if defined(LMP_SIMD_COMPILER)
      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          // inner loop runs the full aligned width; entries past the
          // stencil order contribute zero (rho padding)
          #if defined(LMP_SIMD_COMPILER)
          #pragma simd
          #endif
          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
            int mzyx = l + mzy;
            my_density[mzyx] += x0*rho[0][l];
          }
        }
      }
    }
  }

  // reduce all the perthread_densities into global_density
  if (nthr > 1) {
    #if defined(_OPENMP)
    #pragma omp parallel default(none) \
      shared(nthr, global_density) if(!_use_lrt)
    #endif
    {
      int ifrom, ito, tid;
      IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);

      #if defined(LMP_SIMD_COMPILER)
      #pragma simd
      #endif
      for (int i = ifrom; i < ito; i++) {
        for (int j = 1; j < nthr; j++) {
          global_density[i] += perthread_density[j - 1][i];
        }
      }
    }
  }
}
/* ----------------------------------------------------------------------
interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ik
   when use_packing is set, the x/y field components are interleaved in
   vdxy_brick (and z with a zero lane in vdz0_brick) so the whole stencil
   row fits one SIMD vector; stencil weights are duplicated accordingly
------------------------------------------------------------------------- */

template<class flt_t, class acc_t, int use_table, int use_packing>
void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
{
  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  ATOM_T * _noalias const x = buffers->get_x(0);
  flt_t * _noalias const q = buffers->get_q(0);
  FORCE_T * _noalias const f = buffers->get_f();
  int nlocal = atom->nlocal;
  int nthr;
  // LRT mode dedicates one thread to kspace work
  if (_use_lrt)
    nthr = 1;
  else
    nthr = comm->nthreads;

  #if defined(_OPENMP)
  #pragma omp parallel default(none) \
    shared(nlocal, nthr) if(!_use_lrt)
  #endif
  {
    const flt_t lo0 = boxlo[0];
    const flt_t lo1 = boxlo[1];
    const flt_t lo2 = boxlo[2];
    const flt_t xi = delxinv;
    const flt_t yi = delyinv;
    const flt_t zi = delzinv;
    const flt_t fshiftone = shiftone;
    const flt_t fqqrd2es = qqrd2e * scale;

    int ifrom, ito, tid;
    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

    // x-weights (rho0) are double width so they can be duplicated per
    // lane in packing mode; y/z weights stay single width
    _alignvar(flt_t rho0[2*INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

    for (int i = ifrom; i < ito; i++) {
      int nx = part2grid[i][0];
      int ny = part2grid[i][1];
      int nz = part2grid[i][2];

      // in packing mode x indices address the interleaved (doubled) brick
      int nxsum = (use_packing ? 2 : 1) * (nx + nlower);
      int nysum = ny + nlower;
      int nzsum = nz + nlower;

      FFT_SCALAR dx = nx + fshiftone - (x[i].x - lo0)*xi;
      FFT_SCALAR dy = ny + fshiftone - (x[i].y - lo1)*yi;
      FFT_SCALAR dz = nz + fshiftone - (x[i].z - lo2)*zi;

      if (use_table) {
        // fetch stencil weights from the precomputed lookup table
        dx = dx*half_rho_scale + half_rho_scale_plus;
        int idx = dx;
        dy = dy*half_rho_scale + half_rho_scale_plus;
        int idy = dy;
        dz = dz*half_rho_scale + half_rho_scale_plus;
        int idz = dz;
        #if defined(LMP_SIMD_COMPILER)
        #pragma simd
        #endif
        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
          if (use_packing) {
            // duplicate each x weight for the interleaved x/y lanes
            rho0[2*k] = rho_lookup[idx][k];
            rho0[2*k + 1] = rho_lookup[idx][k];
          } else {
            rho0[k] = rho_lookup[idx][k];
          }
          rho1[k] = rho_lookup[idy][k];
          rho2[k] = rho_lookup[idz][k];
        }
      } else {
        // evaluate the weight polynomials directly (Horner's rule)
        #if defined(LMP_SIMD_COMPILER)
        #pragma simd
        #endif
        for (int k = nlower; k <= nupper; k++) {
          FFT_SCALAR r1 = rho_coeff[order - 1][k];
          FFT_SCALAR r2 = rho_coeff[order - 1][k];
          FFT_SCALAR r3 = rho_coeff[order - 1][k];
          for (int l = order - 2; l >= 0; l--) {
            r1 = rho_coeff[l][k] + r1*dx;
            r2 = rho_coeff[l][k] + r2*dy;
            r3 = rho_coeff[l][k] + r3*dz;
          }
          if (use_packing) {
            rho0[2*(k - nlower)] = r1;
            rho0[2*(k - nlower) + 1] = r1;
          } else {
            rho0[k - nlower] = r1;
          }
          rho1[k - nlower] = r2;
          rho2[k - nlower] = r3;
        }
      }

      // per-lane field accumulators; reduced to scalars after the loops
      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
      _alignvar(FFT_SCALAR ekxy_arr[2*INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
      _alignvar(FFT_SCALAR ekz0_arr[2*INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n + nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m + nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
          #pragma simd
          #endif
          for (int l = 0; l < (use_packing ? 2 : 1) *
                 INTEL_P3M_ALIGNED_MAXORDER; l++) {
            int mx = l + nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
            if (use_packing) {
              ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx];
              ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx];
            } else {
              ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
              eky_arr[l] -= x0*vdy_brick[mz][my][mx];
              ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
            }
          }
        }
      }

      // horizontal reduction of the lane accumulators
      FFT_SCALAR ekx, eky, ekz;
      ekx = eky = ekz = ZEROF;
      if (use_packing) {
        // even lanes hold x, odd lanes hold y; z uses even lanes of ekz0
        for (int l = 0; l < 2*order; l += 2) {
          ekx += ekxy_arr[l];
          eky += ekxy_arr[l + 1];
          ekz += ekz0_arr[l];
        }
      } else {
        for (int l = 0; l < order; l++) {
          ekx += ekx_arr[l];
          eky += eky_arr[l];
          ekz += ekz_arr[l];
        }
      }

      // convert E-field to force

      const flt_t qfactor = fqqrd2es * q[i];
      f[i].x += qfactor*ekx;
      f[i].y += qfactor*eky;
      // skip z force when slabflag == 2 (z handled elsewhere in that mode)
      if (slabflag != 2) f[i].z += qfactor*ekz;
    }
  }
}
/* ----------------------------------------------------------------------
interpolate from grid to get electric field & force on my particles for ad
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ad
   (analytic differentiation): the field is obtained by differentiating the
   stencil weights against the potential brick u_brick, then a
   self-force correction built from sf_coeff sine terms is subtracted
------------------------------------------------------------------------- */

template<class flt_t, class acc_t, int use_table>
void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
{
  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  ATOM_T * _noalias const x = buffers->get_x(0);
  const flt_t * _noalias const q = buffers->get_q(0);
  FORCE_T * _noalias const f = buffers->get_f();
  int nlocal = atom->nlocal;
  int nthr;
  // LRT mode dedicates one thread to kspace work
  if (_use_lrt)
    nthr = 1;
  else
    nthr = comm->nthreads;

  // local aliases of the per-particle field arrays for _noalias hints
  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;

  #if defined(_OPENMP)
  #pragma omp parallel default(none) \
    shared(nlocal, nthr) if(!_use_lrt)
  #endif
  {
    const flt_t ftwo_pi = MY_PI * 2.0;
    const flt_t ffour_pi = MY_PI * 4.0;

    const flt_t lo0 = boxlo[0];
    const flt_t lo1 = boxlo[1];
    const flt_t lo2 = boxlo[2];
    const flt_t xi = delxinv;
    const flt_t yi = delyinv;
    const flt_t zi = delzinv;
    const flt_t fshiftone = shiftone;
    const flt_t fqqrd2es = qqrd2e * scale;

    const double *prd = domain->prd;
    const double xprd = prd[0];
    const double yprd = prd[1];
    const double zprd = prd[2];

    // inverse grid spacings used to scale the differentiated weights
    const flt_t hx_inv = nx_pppm/xprd;
    const flt_t hy_inv = ny_pppm/yprd;
    const flt_t hz_inv = nz_pppm/zprd;

    // self-force correction coefficients (two sine terms per dimension)
    const flt_t fsf_coeff0 = sf_coeff[0];
    const flt_t fsf_coeff1 = sf_coeff[1];
    const flt_t fsf_coeff2 = sf_coeff[2];
    const flt_t fsf_coeff3 = sf_coeff[3];
    const flt_t fsf_coeff4 = sf_coeff[4];
    const flt_t fsf_coeff5 = sf_coeff[5];

    int ifrom, ito, tid;
    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);

    // stencil weights and their derivatives, padded to the aligned width
    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

    for (int i = ifrom; i < ito; i++) {
      int nx = part2grid[i][0];
      int ny = part2grid[i][1];
      int nz = part2grid[i][2];
      FFT_SCALAR dx = nx + fshiftone - (x[i].x - lo0)*xi;
      FFT_SCALAR dy = ny + fshiftone - (x[i].y - lo1)*yi;
      FFT_SCALAR dz = nz + fshiftone - (x[i].z - lo2)*zi;

      int nxsum = nx + nlower;
      int nysum = ny + nlower;
      int nzsum = nz + nlower;

      if (use_table) {
        // fetch weights and derivative weights from the lookup tables
        dx = dx*half_rho_scale + half_rho_scale_plus;
        int idx = dx;
        dy = dy*half_rho_scale + half_rho_scale_plus;
        int idy = dy;
        dz = dz*half_rho_scale + half_rho_scale_plus;
        int idz = dz;
        #if defined(LMP_SIMD_COMPILER)
        #pragma simd
        #endif
        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
          rho[0][k] = rho_lookup[idx][k];
          rho[1][k] = rho_lookup[idy][k];
          rho[2][k] = rho_lookup[idz][k];
          drho[0][k] = drho_lookup[idx][k];
          drho[1][k] = drho_lookup[idy][k];
          drho[2][k] = drho_lookup[idz][k];
        }
      } else {
        // evaluate weight polynomials and their derivatives (Horner's rule)
        #if defined(LMP_SIMD_COMPILER)
        #pragma simd
        #endif
        for (int k = nlower; k <= nupper; k++) {
          FFT_SCALAR r1, r2, r3, dr1, dr2, dr3;
          dr1 = dr2 = dr3 = ZEROF;

          r1 = rho_coeff[order - 1][k];
          r2 = rho_coeff[order - 1][k];
          r3 = rho_coeff[order - 1][k];
          for (int l = order - 2; l >= 0; l--) {
            r1 = rho_coeff[l][k] + r1*dx;
            r2 = rho_coeff[l][k] + r2*dy;
            r3 = rho_coeff[l][k] + r3*dz;
            dr1 = drho_coeff[l][k] + dr1*dx;
            dr2 = drho_coeff[l][k] + dr2*dy;
            dr3 = drho_coeff[l][k] + dr3*dz;
          }
          rho[0][k - nlower] = r1;
          rho[1][k - nlower] = r2;
          rho[2][k - nlower] = r3;
          drho[0][k - nlower] = dr1;
          drho[1][k - nlower] = dr2;
          drho[2][k - nlower] = dr3;
        }
      }

      // per-lane accumulators, reduced after the stencil loops
      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;

      #if defined(LMP_SIMD_COMPILER)
      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m + nysum;
          // the derivative goes on exactly one dimension per component
          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
          #pragma simd
          #endif
          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
            int mx = l + nxsum;
            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
            eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
            ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx];
          }
        }
      }

      // horizontal reduction into the per-particle field arrays
      #if defined(LMP_SIMD_COMPILER)
      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int l = 0; l < order; l++) {
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
      }
    }

    // second pass: scale fields and convert to forces (vectorizable)
    #if defined(LMP_SIMD_COMPILER)
    #pragma simd
    #endif
    for (int i = ifrom; i < ito; i++) {
      particle_ekx[i] *= hx_inv;
      particle_eky[i] *= hy_inv;
      particle_ekz[i] *= hz_inv;

      // convert E-field to force

      const flt_t qfactor = fqqrd2es * q[i];
      const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];

      const flt_t s1 = x[i].x * hx_inv;
      const flt_t s2 = x[i].y * hy_inv;
      const flt_t s3 = x[i].z * hz_inv;

      // subtract the ad self-force correction per dimension
      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
      sf += fsf_coeff1 * sin(ffour_pi * s1);
      sf *= twoqsq;
      f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;

      sf = fsf_coeff2 * sin(ftwo_pi * s2);
      sf += fsf_coeff3 * sin(ffour_pi * s2);
      sf *= twoqsq;
      f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;

      sf = fsf_coeff4 * sin(ftwo_pi * s3);
      sf += fsf_coeff5 * sin(ffour_pi * s3);
      sf *= twoqsq;
      // skip z force when slabflag == 2 (z handled elsewhere in that mode)
      if (slabflag != 2)
        f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
    }
  }
}
/* ----------------------------------------------------------------------
FFT-based Poisson solver for ik
Does special things for packing mode to avoid repeated copies
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   FFT-based Poisson solver for ik
   Does special things for packing mode to avoid repeated copies:
   the x and y gradients are transformed in work2/work3 and written
   interleaved into vdxy_brick; the z gradient is interleaved with a
   zero lane in vdz0_brick
------------------------------------------------------------------------- */

void PPPMIntel::poisson_ik_intel()
{
  // without packing, the base-class solver already does the right thing
  if (_use_packing == 0) {
    poisson_ik();
    return;
  }

  int i, j, k, n;
  double eng;

  // transform charge density (r -> k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n++] = density_fft[i];
    work1[n++] = ZEROF;  // imaginary part
  }

  fft1->compute(work1, work1, 1);

  // global energy and virial contribution

  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
  double s2 = scaleinv*scaleinv;

  if (eflag_global || vflag_global) {
    if (vflag_global) {
      n = 0;
      for (i = 0; i < nfft; i++) {
        // |rho(k)|^2 weighted by the Green's function
        eng = s2 * greensfn[i] *
          (work1[n]*work1[n] + work1[n + 1]*work1[n + 1]);
        for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
        if (eflag_global) energy += eng;
        n += 2;
      }
    } else {
      n = 0;
      for (i = 0; i < nfft; i++) {
        energy +=
          s2 * greensfn[i] *
          (work1[n]*work1[n] + work1[n + 1]*work1[n + 1]);
        n += 2;
      }
    }
  }

  // scale by 1/total-grid-pts to get rho(k)
  // multiply by Green's function to get V(k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n++] *= scaleinv * greensfn[i];
    work1[n++] *= scaleinv * greensfn[i];
  }

  // extra FFTs for per-atom energy/virial

  if (evflag_atom) poisson_peratom();

  // triclinic system

  if (triclinic) {
    poisson_ik_triclinic();
    return;
  }

  // compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
  // FFT leaves data in 3d brick decomposition
  // copy it into inner portion of vdx,vdy,vdz arrays

  // x direction gradient

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fkx[i]*work1[n + 1];
        work2[n + 1] = -fkx[i]*work1[n];
        n += 2;
      }

  fft2->compute(work2, work2, -1);

  // y direction gradient
  // uses the dedicated work3 buffer so work2 (x) survives until both
  // are copied interleaved into vdxy_brick below

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work3[n] = fky[j]*work1[n + 1];
        work3[n + 1] = -fky[j]*work1[n];
        n += 2;
      }

  fft2->compute(work3, work3, -1);

  // interleave x (even lanes) and y (odd lanes) into vdxy_brick
  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdxy_brick[k][j][2*i] = work2[n];
        vdxy_brick[k][j][2*i + 1] = work3[n];
        n += 2;
      }

  // z direction gradient

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work2[n] = fkz[k]*work1[n + 1];
        work2[n + 1] = -fkz[k]*work1[n];
        n += 2;
      }

  fft2->compute(work2, work2, -1);

  // interleave z with a zero lane so the brick matches vdxy's layout
  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdz0_brick[k][j][2*i] = work2[n];
        vdz0_brick[k][j][2*i + 1] = 0.;
        n += 2;
      }
}
/* ----------------------------------------------------------------------
precompute rho coefficients as a lookup table to save time in make_rho
and fieldforce. Instead of doing this polynomial for every atom 6 times
per time step, precompute it for some number of points.
------------------------------------------------------------------------- */
void
PPPMIntel
::
precompute_rho
()
{
half_rho_scale
=
(
rho_points
-
1.
)
/
2.
;
half_rho_scale_plus
=
half_rho_scale
+
0.5
;
for
(
int
i
=
0
;
i
<
rho_points
;
i
++
)
{
FFT_SCALAR
dx
=
-
1.
+
1.
/
half_rho_scale
*
(
FFT_SCALAR
)
i
;
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#endif
for
(
int
k
=
nlower
;
k
<=
nupper
;
k
++
){
FFT_SCALAR
r1
=
ZEROF
;
for
(
int
l
=
order
-
1
;
l
>=
0
;
l
--
){
r1
=
rho_coeff
[
l
][
k
]
+
r1
*
dx
;
}
rho_lookup
[
i
][
k
-
nlower
]
=
r1
;
}
for
(
int
k
=
nupper
-
nlower
+
1
;
k
<
INTEL_P3M_ALIGNED_MAXORDER
;
k
++
)
{
rho_lookup
[
i
][
k
]
=
0
;
}
if
(
differentiation_flag
==
1
)
{
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#endif
for
(
int
k
=
nlower
;
k
<=
nupper
;
k
++
){
FFT_SCALAR
r1
=
ZEROF
;
for
(
int
l
=
order
-
2
;
l
>=
0
;
l
--
){
r1
=
drho_coeff
[
l
][
k
]
+
r1
*
dx
;
}
drho_lookup
[
i
][
k
-
nlower
]
=
r1
;
}
for
(
int
k
=
nupper
-
nlower
+
1
;
k
<
INTEL_P3M_ALIGNED_MAXORDER
;
k
++
)
{
drho_lookup
[
i
][
k
]
=
0
;
}
}
}
}
/* ----------------------------------------------------------------------
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
void
PPPMIntel
::
pack_forward
(
int
flag
,
FFT_SCALAR
*
buf
,
int
nlist
,
int
*
list
)
{
int
n
=
0
;
if
((
flag
==
FORWARD_IK
)
&&
_use_packing
)
{
FFT_SCALAR
*
xsrc
=
&
vdxy_brick
[
nzlo_out
][
nylo_out
][
2
*
nxlo_out
];
FFT_SCALAR
*
zsrc
=
&
vdz0_brick
[
nzlo_out
][
nylo_out
][
2
*
nxlo_out
];
for
(
int
i
=
0
;
i
<
nlist
;
i
++
)
{
buf
[
n
++
]
=
xsrc
[
list
[
i
]];
buf
[
n
++
]
=
zsrc
[
list
[
i
]];
}
}
else
{
PPPM
::
pack_forward
(
flag
,
buf
,
nlist
,
list
);
}
}
/* ----------------------------------------------------------------------
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void
PPPMIntel
::
unpack_forward
(
int
flag
,
FFT_SCALAR
*
buf
,
int
nlist
,
int
*
list
)
{
int
n
=
0
;
if
((
flag
==
FORWARD_IK
)
&&
_use_packing
)
{
FFT_SCALAR
*
xdest
=
&
vdxy_brick
[
nzlo_out
][
nylo_out
][
2
*
nxlo_out
];
FFT_SCALAR
*
zdest
=
&
vdz0_brick
[
nzlo_out
][
nylo_out
][
2
*
nxlo_out
];
for
(
int
i
=
0
;
i
<
nlist
;
i
++
)
{
xdest
[
list
[
i
]]
=
buf
[
n
++
];
zdest
[
list
[
i
]]
=
buf
[
n
++
];
}
}
else
{
PPPM
::
unpack_forward
(
flag
,
buf
,
nlist
,
list
);
}
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
double PPPMIntel::memory_usage()
{
  // Base-class grids plus this class's extra allocations.
  double bytes = PPPM::memory_usage();

  // (nthreads-1) additional grid-sized arrays when threading without LRT
  if ((comm->nthreads > 1) && !_use_lrt)
    bytes += (comm->nthreads - 1) * (ngrid + INTEL_P3M_ALIGNED_MAXORDER) *
      sizeof(FFT_SCALAR);

  // three per-atom FFT_SCALAR arrays for ad differentiation
  if (differentiation_flag == 1)
    bytes += 3 * nmax * sizeof(FFT_SCALAR);

  // rho lookup table (plus drho table for ad differentiation)
  if (_use_table) {
    bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
    if (differentiation_flag == 1)
      bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
  }

  if (_use_packing) {
    const int nxb = nxhi_out - nxlo_out + 1;
    const int nyb = nyhi_out - nylo_out + 1;
    const int nzb = nzhi_out - nzlo_out + 1;
    // interleaved vdxy/vdz0 bricks (double-width x, two extra z planes) ...
    bytes += 2 * (nzb + 2) * nyb * (2 * nxb) * sizeof(FFT_SCALAR);
    // ... replace the three separate vdx/vdy/vdz bricks counted by the base
    bytes -= 3 * nxb * nyb * nzb * sizeof(FFT_SCALAR);
    // work3 buffer used for the packed transform
    bytes += 2 * nfft_both * sizeof(FFT_SCALAR);
    bytes += cg_pack->memory_usage();
  }

  return bytes;
}
/* ----------------------------------------------------------------------
Pack data into intel package buffers if using LRT mode
------------------------------------------------------------------------- */
void PPPMIntel::pack_buffers()
{
  // Repack local+ghost atom data into the intel fix's coprocessor-friendly
  // buffers; the whole operation is timed under TIME_PACK.
  fix->start_watch(TIME_PACK);

  // use all threads only above the hyperthread threshold
  const int packthreads = (comm->nthreads > INTEL_HTHREADS) ?
    comm->nthreads : 1;

#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
  {
    int ifrom, ito, tid;
    // split [0, nlocal+nghost) across threads, aligned for the atom layout
    IP_PRE_omp_range_id_align(ifrom, ito, tid,
                              atom->nlocal + atom->nghost, packthreads,
                              sizeof(IntelBuffers<float,double>::atom_t));
    // dispatch on the fix's configured precision mode
    if (fix->precision() == FixIntel::PREC_MODE_MIXED)
      fix->get_mixed_buffers()->thr_pack(ifrom, ito, 1);
    else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
      fix->get_double_buffers()->thr_pack(ifrom, ito, 1);
    else
      fix->get_single_buffers()->thr_pack(ifrom, ito, 1);
  }

  fix->stop_watch(TIME_PACK);
}
/* ----------------------------------------------------------------------
Allocate density_brick with extra padding for vector writes
------------------------------------------------------------------------- */
void PPPMIntel::allocate()
{
  // Let the base class allocate everything, then replace the bricks the
  // vectorized kernels write to with padded versions (see create3d_offset).
  PPPM::allocate();

  memory->destroy3d_offset(density_brick, nzlo_out, nylo_out, nxlo_out);
  create3d_offset(density_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out,
                  nxlo_out, nxhi_out, "pppm:density_brick");

  if (differentiation_flag == 1) {
    // ad differentiation: only the potential brick is re-created
    memory->destroy3d_offset(u_brick, nzlo_out, nylo_out, nxlo_out);
    create3d_offset(u_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out,
                    nxlo_out, nxhi_out, "pppm:u_brick");
  } else {
    // ik differentiation: re-create each field-component brick in turn
    memory->destroy3d_offset(vdx_brick, nzlo_out, nylo_out, nxlo_out);
    create3d_offset(vdx_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out,
                    nxlo_out, nxhi_out, "pppm:vdx_brick");
    memory->destroy3d_offset(vdy_brick, nzlo_out, nylo_out, nxlo_out);
    create3d_offset(vdy_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out,
                    nxlo_out, nxhi_out, "pppm:vdy_brick");
    memory->destroy3d_offset(vdz_brick, nzlo_out, nylo_out, nxlo_out);
    create3d_offset(vdz_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out,
                    nxlo_out, nxhi_out, "pppm:vdz_brick");
  }
}
/* ----------------------------------------------------------------------
Create 3D-offset allocation with extra padding for vector writes
------------------------------------------------------------------------- */
// Allocate a 3d array indexable as array[n1lo..n1hi][n2lo..n2hi][n3lo..n3hi],
// backed by one contiguous data slab with extra room at the end so vector
// writes may safely run past the last element.  Returns (and assigns) the
// offset-adjusted top-level pointer.
FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo,
                                          int n1hi, int n2lo, int n2hi,
                                          int n3lo, int n3hi,
                                          const char *name)
{
  // extents along each dimension
  int n1 = n1hi - n1lo + 1;
  int n2 = n2hi - n2lo + 1;
  int n3 = n3hi - n3lo + 1;

  // contiguous slab for all values, plus trailing padding for overrun
  // NOTE(review): the padding term INTEL_P3M_ALIGNED_MAXORDER*2 is not
  // multiplied by sizeof(FFT_SCALAR), so it adds that many *bytes*, not
  // elements — confirm this is the intended amount of overrun room.
  bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) * n1 * n2 * n3 +
    INTEL_P3M_ALIGNED_MAXORDER * 2;
  FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes, name);

  // n1*n2 row pointers into the data slab
  nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1 * n2;
  FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes, name);

  // n1 plane pointers at the top level
  nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
  array = (FFT_SCALAR ***) memory->smalloc(nbytes, name);

  // wire up the hierarchy: array[i][j] points at row (i,j) of data
  bigint m;
  bigint n = 0;
  for (int i = 0; i < n1; i++) {
    m = ((bigint) i) * n2;
    array[i] = &plane[m];
    for (int j = 0; j < n2; j++) {
      plane[m + j] = &data[n];
      n += n3;
    }
  }

  // shift every pointer level so indexing starts at the caller's
  // (possibly negative) lower bounds instead of zero
  m = ((bigint) n1) * n2;
  for (bigint i = 0; i < m; i++) array[0][i] -= n3lo;
  for (int i = 0; i < n1; i++) array[i] -= n2lo;
  array -= n1lo;
  return array;
}
/* ----------------------------------------------------------------------
Returns 0 if Intel optimizations for PPPM ignored due to offload
------------------------------------------------------------------------- */
#ifdef _LMP_INTEL_OFFLOAD
int
PPPMIntel
::
use_base
()
{
return
_use_base
;
}
#endif
Event Timeline
Log In to Comment