Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88484885
lal_eam_lj.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 19, 01:41
Size
20 KB
Mime Type
text/x-c++
Expires
Mon, Oct 21, 01:41 (2 d)
Engine
blob
Format
Raw Data
Handle
21756781
Attached To
rLAMMPS lammps
lal_eam_lj.cpp
View Options
/***************************************************************************
lal_eam_lj.cpp
-------------------
W. Michael Brown, Trung Dac Nguyen (ORNL)
Class for acceleration of the eam/lj pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov nguyentd@ornl.gov
***************************************************************************/
// Select the device kernel source: OpenCL string header or CUDA PTX header.
#ifdef USE_OPENCL
#include "eam_lj_cl.h"
#else
#include "eam_lj_ptx.h"
#endif
#include "lal_eam_lj.h"
#include <cassert>

using namespace LAMMPS_AL;

// Shorthand for the templated class name used in the member definitions below.
#define EAMLJT EAMLJ<numtyp, acctyp>
// NOTE(review): MIN/MAX do not appear to be used in this translation unit.
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MAX(A,B) ((A) > (B) ? (A) : (B))

// Global device manager instantiated elsewhere in the library.
extern Device<PRECISION,ACC_PRECISION> device;
// Construct an uninitialized accelerator object; all device resources are
// deferred to init(), so the flags start out false.
template <class numtyp, class acctyp>
EAMLJT::EAMLJ() : BaseAtomic<numtyp,acctyp>(),
                  _compiled_energy(false),
                  _allocated(false) {
}
// Destructor: release every device buffer, kernel, and timer owned by this
// object (clear() is a no-op when init() never ran).
template <class numtyp, class acctyp>
EAMLJT::~EAMLJ() {
  clear();
}
// Initialize the EAM/LJ accelerator: compile/attach kernels, allocate the
// per-atom electron-density (fp) buffers, and pack the EAM spline tables and
// LJ coefficient tables into device memory.
//
// Returns 0 on success, otherwise the error code from init_atomic().
// Table parameters (rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr) mirror the
// host-side EAM setfl/funcfl tables; the host_* arrays are indexed
// [type][type] or [table][point][coefficient].
template <class numtyp, class acctyp>
int EAMLJT::init(const int ntypes, double host_cutforcesq,
                 int **host_type2rhor, int **host_type2z2r,
                 int *host_type2frho, double ***host_rhor_spline,
                 double ***host_z2r_spline, double ***host_frho_spline,
                 double **host_cutsq, double **host_lj1, double **host_lj2,
                 double **host_lj3, double **host_lj4, double **host_offset,
                 double *host_special_lj, double rdr, double rdrho,
                 int nrhor, int nrho, int nz2r, int nfrho, int nr,
                 const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *_screen) {
  int success;
  // Base-class setup: device selection, neighbor storage, pair program build.
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                            gpu_split,_screen,eam_lj);
  if (success!=0)
    return success;

  // allocate fp
  // On a CPU "device" the host buffer can be aliased instead of copied.
  bool cpuview=false;
  if (this->ucl_device->device_type()==UCL_CPU)
    cpuview=true;

  // Guard against an empty initial system; 2000 is a starting capacity only.
  int ef_nall=nall;
  if (ef_nall==0)
    ef_nall=2000;
  // 10% headroom to reduce reallocation frequency as the atom count grows.
  _max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
  host_fp.alloc(_max_fp_size,*(this->ucl_device));
  if (cpuview)
    dev_fp.view(host_fp);
  else
    dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);

  // Attach the density/embedding-energy kernels and the fp texture.
  k_energy.set_function(*(this->pair_program),"kernel_energy");
  k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
  fp_tex.get_texture(*(this->pair_program),"fp_tex");
  fp_tex.bind_float(dev_fp,1);
  _compiled_energy=true;

  // Initialize timers for selected GPU
  time_pair2.init(*(this->ucl_device));
  time_pair2.zero();
  time_fp1.init(*(this->ucl_device));
  time_fp1.zero();
  time_fp2.init(*(this->ucl_device));
  time_fp2.zero();

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    // Pad the type count so the fast kernel's shared-memory layout is fixed.
    lj_types=max_shared_types;
    shared_types=true;
  }
  _ntypes=lj_types;
  _cutforcesq=host_cutforcesq;
  _rdr=rdr;
  _rdrho=rdrho;
  _nrhor=nrhor;
  _nrho=nrho;
  _nz2r=nz2r;
  _nfrho=nfrho;
  _nr=nr;

  // EAM allocation
  // Host staging buffer for the per-type-pair table indices; zero the padded
  // entries beyond ntypes so the kernel reads defined values.
  UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
                             UCL_WRITE_OPTIMIZED);
  for (int i=0; i<lj_types*lj_types; i++) {
    dview_type[i].x=0;
    dview_type[i].y=0;
  }

  // pack type2rhor and type2z2r
  type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<ntypes; i++) {
    for (int j=0; j<ntypes; j++) {
      dview_type[i*lj_types+j].x=host_type2rhor[i][j];
      dview_type[i*lj_types+j].y=host_type2z2r[i][j];
    }
  }
  ucl_copy(type2rhor_z2r,dview_type,false);

  // pack type2frho
  UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
                                 UCL_WRITE_OPTIMIZED);
  type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<ntypes; i++)
    dview_type2frho[i]=host_type2frho[i];
  ucl_copy(type2frho,dview_type2frho,false);

  // pack frho_spline
  // Each 7-coefficient host spline row is split across two float4 tables:
  // coefficients 0-2 (w unused) in *_spline1, coefficients 3-6 in *_spline2.
  UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
                                       UCL_WRITE_OPTIMIZED);
  for (int ix=0; ix<nfrho; ix++)
    for (int iy=0; iy<nrho+1; iy++) {
      dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][0];
      dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][1];
      dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][2];
      dview_frho_spline[ix*(nrho+1)+iy].w=0;
    }
  frho_spline1.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(frho_spline1,dview_frho_spline,false);
  frho_spline1_tex.get_texture(*(this->pair_program),"frho_sp1_tex");
  frho_spline1_tex.bind_float(frho_spline1,4);

  // Second half of the frho coefficients (the staging buffer is reused).
  for (int ix=0; ix<nfrho; ix++)
    for (int iy=0; iy<nrho+1; iy++) {
      dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][3];
      dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][4];
      dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][5];
      dview_frho_spline[ix*(nrho+1)+iy].w=host_frho_spline[ix][iy][6];
    }
  frho_spline2.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(frho_spline2,dview_frho_spline,false);
  frho_spline2_tex.get_texture(*(this->pair_program),"frho_sp2_tex");
  frho_spline2_tex.bind_float(frho_spline2,4);

  // pack rhor_spline
  // Same split layout as frho above, indexed over the r grid (nr+1 points).
  UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
                                       UCL_WRITE_OPTIMIZED);
  for (int ix=0; ix<nrhor; ix++)
    for (int iy=0; iy<nr+1; iy++) {
      dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][0];
      dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][1];
      dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][2];
      dview_rhor_spline[ix*(nr+1)+iy].w=(numtyp)0;
    }
  rhor_spline1.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(rhor_spline1,dview_rhor_spline,false);
  rhor_spline1_tex.get_texture(*(this->pair_program),"rhor_sp1_tex");
  rhor_spline1_tex.bind_float(rhor_spline1,4);

  for (int ix=0; ix<nrhor; ix++)
    for (int iy=0; iy<nr+1; iy++) {
      dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][3];
      dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][4];
      dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][5];
      dview_rhor_spline[ix*(nr+1)+iy].w=host_rhor_spline[ix][iy][6];
    }
  rhor_spline2.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(rhor_spline2,dview_rhor_spline,false);
  rhor_spline2_tex.get_texture(*(this->pair_program),"rhor_sp2_tex");
  rhor_spline2_tex.bind_float(rhor_spline2,4);

  // pack z2r_spline
  // Same split layout again for the pair-interaction z2r tables.
  UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
                                      UCL_WRITE_OPTIMIZED);
  for (int ix=0; ix<nz2r; ix++)
    for (int iy=0; iy<nr+1; iy++) {
      dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][0];
      dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][1];
      dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][2];
      dview_z2r_spline[ix*(nr+1)+iy].w=(numtyp)0;
    }
  z2r_spline1.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(z2r_spline1,dview_z2r_spline,false);
  z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex");
  z2r_spline1_tex.bind_float(z2r_spline1,4);

  for (int ix=0; ix<nz2r; ix++)
    for (int iy=0; iy<nr+1; iy++) {
      dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][3];
      dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][4];
      dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][5];
      dview_z2r_spline[ix*(nr+1)+iy].w=host_z2r_spline[ix][iy][6];
    }
  z2r_spline2.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(z2r_spline2,dview_z2r_spline,false);
  z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex");
  z2r_spline2_tex.bind_float(z2r_spline2,4);

  // LJ allocation
  // Allocate a host write buffer for data initialization
  // NOTE(review): sized *32 but only the first lj_types*lj_types entries are
  // zeroed here; type_pack4 presumably manages the rest — confirm upstream.
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_OPTIMIZED);
  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  // lj1 packs (lj1, lj2, cutsq); lj3 packs (lj3, lj4, offset) per type pair.
  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_cutsq);
  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  // Special-bond LJ scaling factors (4 entries).
  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  // Account for all device allocations made above for memory reporting.
  this->_max_bytes=type2rhor_z2r.row_bytes()+type2frho.row_bytes()
      +rhor_spline1.row_bytes()+rhor_spline2.row_bytes()
      +frho_spline1.row_bytes()+frho_spline2.row_bytes()
      +z2r_spline1.row_bytes()+z2r_spline2.row_bytes()
      +dev_fp.row_bytes()+lj1.row_bytes()+lj3.row_bytes()
      +sp_lj.row_bytes();
  return 0;
}
// Release all device buffers, timers, and (if compiled) the energy kernels.
// Safe to call repeatedly: the _allocated guard makes extra calls no-ops.
template <class numtyp, class acctyp>
void EAMLJT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  // Table and coefficient storage.
  type2rhor_z2r.clear();
  type2frho.clear();
  rhor_spline1.clear();
  rhor_spline2.clear();
  frho_spline1.clear();
  frho_spline2.clear();
  z2r_spline1.clear();
  z2r_spline2.clear();

  // Per-atom fp buffers and the LJ tables.
  host_fp.clear();
  dev_fp.clear();
  lj1.clear();
  lj3.clear();
  sp_lj.clear();

  // Timers owned by this class (base-class timers freed in clear_atomic).
  time_pair2.clear();
  time_fp1.clear();
  time_fp2.clear();

  if (_compiled_energy) {
    k_energy_fast.clear();
    k_energy.clear();
    _compiled_energy=false;
  }

  this->clear_atomic();
}
// Report host-side memory use: the base-class bookkeeping plus the footprint
// of this object itself. Device allocations are tracked via _max_bytes.
template <class numtyp, class acctyp>
double EAMLJT::host_memory_usage() const {
  const double base=this->host_memory_usage_atomic();
  return base+sizeof(EAMLJ<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then compute atom energies/forces
// ---------------------------------------------------------------------------
template
<
class
numtyp
,
class
acctyp
>
void
EAMLJT
::
compute
(
const
int
f_ago
,
const
int
inum_full
,
const
int
nlocal
,
const
int
nall
,
double
**
host_x
,
int
*
host_type
,
int
*
ilist
,
int
*
numj
,
int
**
firstneigh
,
const
bool
eflag
,
const
bool
vflag
,
const
bool
eatom
,
const
bool
vatom
,
int
&
host_start
,
const
double
cpu_time
,
bool
&
success
,
void
**
fp_ptr
)
{
this
->
acc_timers
();
if
(
this
->
device
->
time_device
())
{
// Put time from the second part to the total time_pair
this
->
time_pair
.
add_time_to_total
(
time_pair2
.
time
());
// Add transfer time from device -> host after part 1
this
->
atom
->
add_transfer_time
(
time_fp1
.
time
());
// Add transfer time from host -> device before part 2
this
->
atom
->
add_transfer_time
(
time_fp2
.
time
());
}
// ------------------- Resize FP Array for EAM --------------------
if
(
nall
>
_max_fp_size
)
{
dev_fp
.
clear
();
host_fp
.
clear
();
_max_fp_size
=
static_cast
<
int
>
(
static_cast
<
double
>
(
nall
)
*
1.10
);
host_fp
.
alloc
(
_max_fp_size
,
*
(
this
->
ucl_device
));
if
(
this
->
ucl_device
->
device_type
()
==
UCL_CPU
)
dev_fp
.
view
(
host_fp
);
else
dev_fp
.
alloc
(
_max_fp_size
,
*
(
this
->
ucl_device
));
fp_tex
.
bind_float
(
dev_fp
,
1
);
}
*
fp_ptr
=
host_fp
.
begin
();
// ----------------------------------------------------------------
if
(
inum_full
==
0
)
{
host_start
=
0
;
// Make sure textures are correct if realloc by a different hybrid style
this
->
resize_atom
(
0
,
nall
,
success
);
this
->
zero_timers
();
return
;
}
int
ago
=
this
->
hd_balancer
.
ago_first
(
f_ago
);
int
inum
=
this
->
hd_balancer
.
balance
(
ago
,
inum_full
,
cpu_time
);
this
->
ans
->
inum
(
inum
);
host_start
=
inum
;
// -----------------------------------------------------------------
if
(
ago
==
0
)
{
this
->
reset_nbors
(
nall
,
inum
,
ilist
,
numj
,
firstneigh
,
success
);
if
(
!
success
)
return
;
}
this
->
atom
->
cast_x_data
(
host_x
,
host_type
);
this
->
atom
->
add_x_data
(
host_x
,
host_type
);
loop
(
eflag
,
vflag
);
// copy fp from device to host for comm
_nlocal
=
nlocal
;
time_fp1
.
start
();
ucl_copy
(
host_fp
,
dev_fp
,
nlocal
,
false
);
time_fp1
.
stop
();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU and then compute per-atom densities (part 1 with
// device-side neighbor build). Returns the device neighbor jlist pointer
// (offset by host_start) or NULL on failure / empty step. On return,
// *fp_ptr points at the host fp buffer for inter-proc communication.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** EAMLJT::compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, double *sublo,
                       double *subhi, int *tag, int **nspecial, int **special,
                       const bool eflag, const bool vflag, const bool eatom,
                       const bool vatom, int &host_start, int **ilist,
                       int **jnum, const double cpu_time, bool &success,
                       int &inum, void **fp_ptr) {
  this->acc_timers();
  if (this->device->time_device()) {
    // Put time from the second part to the total time_pair
    this->time_pair.add_time_to_total(time_pair2.time());
    // Add transfer time from device -> host after part 1
    this->atom->add_transfer_time(time_fp1.time());
    // Add transfer time from host -> device before part 2
    this->atom->add_transfer_time(time_fp2.time());
  }

  // ------------------- Resize FP Array for EAM --------------------
  if (nall>_max_fp_size) {
    dev_fp.clear();
    host_fp.clear();
    // 10% headroom to limit reallocation frequency as the system grows.
    _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
    host_fp.alloc(_max_fp_size,*(this->ucl_device));
    if (this->ucl_device->device_type()==UCL_CPU)
      dev_fp.view(host_fp);
    else
      // Fix: use the same access flag as the initial allocation in init();
      // the resize path was inconsistently omitting UCL_WRITE_ONLY.
      dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
    // Rebind the texture since the underlying buffer was reallocated.
    fp_tex.bind_float(dev_fp,1);
  }
  *fp_ptr=host_fp.begin();

  // -----------------------------------------------------------------
  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    this->resize_atom(0,nall,success);
    this->zero_timers();
    return NULL;
  }

  // load balance, returning the atom count on the device (inum)
  this->hd_balancer.balance(cpu_time);
  inum=this->hd_balancer.get_gpu_count(ago,inum_full);
  this->ans->inum(inum);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    this->build_nbor_list(inum,inum_full-inum,nall,host_x,host_type,
                          sublo,subhi,tag,nspecial,special,success);
    if (!success)
      return NULL;
  } else {
    this->atom->cast_x_data(host_x,host_type);
    this->atom->add_x_data(host_x,host_type);
  }
  *ilist=this->nbor->host_ilist.begin();
  *jnum=this->nbor->host_acc.begin();

  loop(eflag,vflag);

  // copy fp from device to host for comm
  _nlocal=inum_full;
  time_fp1.start();
  ucl_copy(host_fp,dev_fp,inum_full,false);
  time_fp1.stop();

  return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// (part 2: runs after the communicated fp values are pushed back to device).
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void EAMLJT::compute2(int *ilist, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom) {
  // Nothing was assigned to the device for this step.
  if (this->ans->inum()==0)
    return;

  this->hd_balancer.start_timer();

  // Transfer the (host-communicated) fp values back to the device.
  time_fp2.start();
  this->add_fp_data();
  time_fp2.stop();

  loop2(eflag,vflag);

  // Copy results back, remapping through ilist when one is provided.
  if (ilist!=NULL)
    this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
  else
    this->ans->copy_answers(eflag,vflag,eatom,vatom);

  this->device->add_ans_object(this->ans);
  this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies and forces (part 1: launches the density /
// embedding-energy kernel, writing per-atom fp values into dev_fp).
// The kernel argument order below must match the device kernel signature.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void EAMLJT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag,vflag;
  // Kernels take int flags, not bools.
  if (_eflag)
    eflag=1;
  else
    eflag=0;
  if (_vflag)
    vflag=1;
  else
    vflag=0;

  // One block row per (atoms / threads-per-atom) work unit, rounded up.
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

  if (shared_types) {
    // Fast path: type tables fit in shared memory (see init()).
    this->k_energy_fast.set_size(GX,BX);
    this->k_energy_fast.run(&this->atom->dev_x.begin(),
                            &type2rhor_z2r.begin(), &type2frho.begin(),
                            &rhor_spline2.begin(), &frho_spline1.begin(),
                            &frho_spline2.begin(),
                            &this->nbor->dev_nbor.begin(),
                            &this->_nbor_data->begin(), &dev_fp.begin(),
                            &this->ans->dev_engv.begin(), &eflag, &ainum,
                            &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr,
                            &_rdrho, &_nrho, &_nr,
                            &this->_threads_per_atom);
  } else {
    this->k_energy.set_size(GX,BX);
    this->k_energy.run(&this->atom->dev_x.begin(),
                       &type2rhor_z2r.begin(), &type2frho.begin(),
                       &rhor_spline2.begin(), &frho_spline1.begin(),
                       &frho_spline2.begin(),
                       &this->nbor->dev_nbor.begin(),
                       &this->_nbor_data->begin(), &dev_fp.begin(),
                       &this->ans->dev_engv.begin(), &eflag, &ainum,
                       &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr,
                       &_rdrho, &_nrho, &_nr,
                       &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques (part 2: launches the pair-force
// kernel that consumes the fp values produced by loop()/add_fp_data()).
// The kernel argument order below must match the device kernel signature.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void EAMLJT::loop2(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag,vflag;
  // Kernels take int flags, not bools.
  if (_eflag)
    eflag=1;
  else
    eflag=0;
  if (_vflag)
    vflag=1;
  else
    vflag=0;

  // One block row per (atoms / threads-per-atom) work unit, rounded up.
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair2.start();

  if (shared_types) {
    // Fast path: type tables fit in shared memory, so _ntypes is not passed.
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
                          &type2rhor_z2r.begin(), &rhor_spline1.begin(),
                          &z2r_spline1.begin(), &z2r_spline2.begin(),
                          &lj1.begin(), &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
                     &type2rhor_z2r.begin(), &rhor_spline1.begin(),
                     &z2r_spline1.begin(), &z2r_spline2.begin(),
                     &lj1.begin(), &lj3.begin(), &sp_lj.begin(),
                     &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(),
                     &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag,
                     &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr,
                     &_nr, &this->_threads_per_atom);
  }
  this->time_pair2.stop();
}
// Explicit instantiation for the precision combination selected at build
// time (PRECISION/ACC_PRECISION are set by the library's build flags).
template class EAMLJ<PRECISION,ACC_PRESISION>;
Event Timeline
Log In to Comment