Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F70060504
pair_gpu_device.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jul 5, 02:17
Size
20 KB
Mime Type
text/x-c++
Expires
Sun, Jul 7, 02:17 (2 d)
Engine
blob
Format
Raw Data
Handle
18782259
Attached To
rLAMMPS lammps
pair_gpu_device.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_device.h"
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_OPENCL
#include "pair_gpu_dev_cl.h"
#else
#include "pair_gpu_dev_ptx.h"
#endif
#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
template
<
class
numtyp
,
class
acctyp
>
PairGPUDeviceT
::
PairGPUDevice
()
:
_init_count
(
0
),
_device_init
(
false
),
_gpu_mode
(
GPU_FORCE
),
_first_device
(
0
),
_last_device
(
0
),
_compiled
(
false
)
{
}
template
<
class
numtyp
,
class
acctyp
>
PairGPUDeviceT
::~
PairGPUDevice
()
{
clear_device
();
}
template
<
class
numtyp
,
class
acctyp
>
int
PairGPUDeviceT
::
init_device
(
MPI_Comm
world
,
MPI_Comm
replica
,
const
int
first_gpu
,
const
int
last_gpu
,
const
int
gpu_mode
,
const
double
p_split
,
const
int
nthreads
,
const
int
t_per_atom
)
{
_nthreads
=
nthreads
;
#ifdef _OPENMP
omp_set_num_threads
(
nthreads
);
#endif
_threads_per_atom
=
t_per_atom
;
_threads_per_charge
=
t_per_atom
;
if
(
_device_init
)
return
0
;
_device_init
=
true
;
_comm_world
=
world
;
_comm_replica
=
replica
;
_first_device
=
first_gpu
;
_last_device
=
last_gpu
;
_gpu_mode
=
gpu_mode
;
_particle_split
=
p_split
;
// Get the rank/size within the world
MPI_Comm_rank
(
_comm_world
,
&
_world_me
);
MPI_Comm_size
(
_comm_world
,
&
_world_size
);
// Get the rank/size within the replica
MPI_Comm_rank
(
_comm_replica
,
&
_replica_me
);
MPI_Comm_size
(
_comm_replica
,
&
_replica_size
);
// Get the names of all nodes
int
name_length
;
char
node_name
[
MPI_MAX_PROCESSOR_NAME
];
char
node_names
[
MPI_MAX_PROCESSOR_NAME
*
_world_size
];
MPI_Get_processor_name
(
node_name
,
&
name_length
);
MPI_Allgather
(
&
node_name
,
MPI_MAX_PROCESSOR_NAME
,
MPI_CHAR
,
&
node_names
,
MPI_MAX_PROCESSOR_NAME
,
MPI_CHAR
,
_comm_world
);
std
::
string
node_string
=
std
::
string
(
node_name
);
// Get the number of procs per node
std
::
map
<
std
::
string
,
int
>
name_map
;
std
::
map
<
std
::
string
,
int
>::
iterator
np
;
for
(
int
i
=
0
;
i
<
_world_size
;
i
++
)
{
std
::
string
i_string
=
std
::
string
(
&
node_names
[
i
*
MPI_MAX_PROCESSOR_NAME
]);
np
=
name_map
.
find
(
i_string
);
if
(
np
==
name_map
.
end
())
name_map
[
i_string
]
=
1
;
else
np
->
second
++
;
}
int
procs_per_node
=
name_map
.
begin
()
->
second
;
// Assign a unique id to each node
int
split_num
=
0
,
split_id
=
0
;
for
(
np
=
name_map
.
begin
();
np
!=
name_map
.
end
();
++
np
)
{
if
(
np
->
first
==
node_string
)
split_id
=
split_num
;
split_num
++
;
}
// Set up a per node communicator and find rank within
MPI_Comm
node_comm
;
MPI_Comm_split
(
_comm_world
,
split_id
,
0
,
&
node_comm
);
int
node_rank
;
MPI_Comm_rank
(
node_comm
,
&
node_rank
);
// set the device ID
_procs_per_gpu
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
procs_per_node
)
/
(
last_gpu
-
first_gpu
+
1
)));
int
my_gpu
=
node_rank
/
_procs_per_gpu
+
first_gpu
;
// Time on the device only if 1 proc per gpu
_time_device
=
true
;
if
(
_procs_per_gpu
>
1
)
_time_device
=
false
;
// Set up a per device communicator
MPI_Comm_split
(
node_comm
,
my_gpu
,
0
,
&
_comm_gpu
);
MPI_Comm_rank
(
_comm_gpu
,
&
_gpu_rank
);
gpu
=
new
UCL_Device
();
if
(
my_gpu
>=
gpu
->
num_devices
())
return
-
2
;
gpu
->
set
(
my_gpu
);
_long_range_precompute
=
0
;
int
flag
=
compile_kernels
();
return
flag
;
}
template
<
class
numtyp
,
class
acctyp
>
int
PairGPUDeviceT
::
init
(
PairGPUAns
<
numtyp
,
acctyp
>
&
ans
,
const
bool
charge
,
const
bool
rot
,
const
int
nlocal
,
const
int
host_nlocal
,
const
int
nall
,
PairGPUNbor
*
nbor
,
const
int
maxspecial
,
const
int
gpu_host
,
const
int
max_nbors
,
const
double
cell_size
,
const
bool
pre_cut
)
{
if
(
!
_device_init
)
return
-
1
;
if
(
sizeof
(
acctyp
)
==
sizeof
(
double
)
&&
gpu
->
double_precision
()
==
false
)
return
-
5
;
// Counts of data transfers for timing overhead estimates
_data_in_estimate
=
0
;
_data_out_estimate
=
1
;
// Initial number of local particles
int
ef_nlocal
=
nlocal
;
if
(
_particle_split
<
1.0
&&
_particle_split
>
0.0
)
ef_nlocal
=
static_cast
<
int
>
(
_particle_split
*
nlocal
);
bool
gpu_nbor
=
false
;
if
(
_gpu_mode
==
GPU_NEIGH
)
gpu_nbor
=
true
;
if
(
_init_count
==
0
)
{
// Initialize atom and nbor data
if
(
!
atom
.
init
(
nall
,
charge
,
rot
,
*
gpu
,
gpu_nbor
,
gpu_nbor
&&
maxspecial
>
0
))
return
-
3
;
_data_in_estimate
++
;
if
(
charge
)
_data_in_estimate
++
;
if
(
rot
)
_data_in_estimate
++
;
}
else
{
if
(
atom
.
charge
()
==
false
&&
charge
)
_data_in_estimate
++
;
if
(
atom
.
quat
()
==
false
&&
rot
)
_data_in_estimate
++
;
if
(
!
atom
.
add_fields
(
charge
,
rot
,
gpu_nbor
,
gpu_nbor
&&
maxspecial
))
return
-
3
;
}
if
(
!
ans
.
init
(
ef_nlocal
,
charge
,
rot
,
*
gpu
))
return
-
3
;
if
(
!
nbor
->
init
(
&
_nbor_shared
,
ef_nlocal
,
host_nlocal
,
max_nbors
,
maxspecial
,
*
gpu
,
gpu_nbor
,
gpu_host
,
pre_cut
,
_block_cell_2d
,
_block_cell_id
,
_block_nbor_build
))
return
-
3
;
nbor
->
cell_size
(
cell_size
);
_init_count
++
;
return
0
;
}
template
<
class
numtyp
,
class
acctyp
>
int
PairGPUDeviceT
::
init
(
PairGPUAns
<
numtyp
,
acctyp
>
&
ans
,
const
int
nlocal
,
const
int
nall
)
{
if
(
!
_device_init
)
return
-
1
;
if
(
sizeof
(
acctyp
)
==
sizeof
(
double
)
&&
gpu
->
double_precision
()
==
false
)
return
-
5
;
if
(
_init_count
==
0
)
{
// Initialize atom and nbor data
if
(
!
atom
.
init
(
nall
,
true
,
false
,
*
gpu
,
false
,
false
))
return
-
3
;
}
else
if
(
!
atom
.
add_fields
(
true
,
false
,
false
,
false
))
return
-
3
;
if
(
!
ans
.
init
(
nlocal
,
true
,
false
,
*
gpu
))
return
-
3
;
_init_count
++
;
return
0
;
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
set_single_precompute
(
PPPMGPUMemory
<
numtyp
,
acctyp
,
float
,
_lgpu_float4
>
*
pppm
)
{
_long_range_precompute
=
1
;
pppm_single
=
pppm
;
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
set_double_precompute
(
PPPMGPUMemory
<
numtyp
,
acctyp
,
double
,
_lgpu_double4
>
*
pppm
)
{
_long_range_precompute
=
2
;
pppm_double
=
pppm
;
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
init_message
(
FILE
*
screen
,
const
char
*
name
,
const
int
first_gpu
,
const
int
last_gpu
)
{
#ifdef USE_OPENCL
std
::
string
fs
=
""
;
#else
std
::
string
fs
=
toa
(
gpu
->
free_gigabytes
())
+
"/"
;
#endif
if
(
_replica_me
==
0
&&
screen
)
{
fprintf
(
screen
,
"
\n
-------------------------------------"
);
fprintf
(
screen
,
"-------------------------------------
\n
"
);
fprintf
(
screen
,
"- Using GPGPU acceleration for %s:
\n
"
,
name
);
fprintf
(
screen
,
"- with %d proc(s) per device.
\n
"
,
_procs_per_gpu
);
#ifdef _OPENMP
fprintf
(
screen
,
"- with %d thread(s) per proc.
\n
"
,
_nthreads
);
#endif
fprintf
(
screen
,
"-------------------------------------"
);
fprintf
(
screen
,
"-------------------------------------
\n
"
);
int
last
=
last_gpu
+
1
;
if
(
last
>
gpu
->
num_devices
())
last
=
gpu
->
num_devices
();
for
(
int
i
=
first_gpu
;
i
<
last
;
i
++
)
{
std
::
string
sname
=
gpu
->
name
(
i
)
+
", "
+
toa
(
gpu
->
cores
(
i
))
+
" cores, "
+
fs
+
toa
(
gpu
->
gigabytes
(
i
))
+
" GB, "
+
toa
(
gpu
->
clock_rate
(
i
))
+
" GHZ ("
;
if
(
sizeof
(
PRECISION
)
==
4
)
{
if
(
sizeof
(
ACC_PRECISION
)
==
4
)
sname
+=
"Single Precision)"
;
else
sname
+=
"Mixed Precision)"
;
}
else
sname
+=
"Double Precision)"
;
fprintf
(
screen
,
"GPU %d: %s
\n
"
,
i
,
sname
.
c_str
());
}
fprintf
(
screen
,
"-------------------------------------"
);
fprintf
(
screen
,
"-------------------------------------
\n\n
"
);
}
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
estimate_gpu_overhead
(
const
int
kernel_calls
,
double
&
gpu_overhead
,
double
&
gpu_driver_overhead
)
{
UCL_H_Vec
<
int
>
*
host_data_in
=
NULL
,
*
host_data_out
=
NULL
;
UCL_D_Vec
<
int
>
*
dev_data_in
=
NULL
,
*
dev_data_out
=
NULL
,
*
kernel_data
=
NULL
;
UCL_Timer
*
timers_in
=
NULL
,
*
timers_out
=
NULL
,
*
timers_kernel
=
NULL
;
UCL_Timer
over_timer
(
*
gpu
);
if
(
_data_in_estimate
>
0
)
{
host_data_in
=
new
UCL_H_Vec
<
int
>
[
_data_in_estimate
];
dev_data_in
=
new
UCL_D_Vec
<
int
>
[
_data_in_estimate
];
timers_in
=
new
UCL_Timer
[
_data_in_estimate
];
}
if
(
_data_out_estimate
>
0
)
{
host_data_out
=
new
UCL_H_Vec
<
int
>
[
_data_out_estimate
];
dev_data_out
=
new
UCL_D_Vec
<
int
>
[
_data_out_estimate
];
timers_out
=
new
UCL_Timer
[
_data_out_estimate
];
}
if
(
kernel_calls
>
0
)
{
kernel_data
=
new
UCL_D_Vec
<
int
>
[
kernel_calls
];
timers_kernel
=
new
UCL_Timer
[
kernel_calls
];
}
for
(
int
i
=
0
;
i
<
_data_in_estimate
;
i
++
)
{
host_data_in
[
i
].
alloc
(
1
,
*
gpu
);
dev_data_in
[
i
].
alloc
(
1
,
*
gpu
);
timers_in
[
i
].
init
(
*
gpu
);
}
for
(
int
i
=
0
;
i
<
_data_out_estimate
;
i
++
)
{
host_data_out
[
i
].
alloc
(
1
,
*
gpu
);
dev_data_out
[
i
].
alloc
(
1
,
*
gpu
);
timers_out
[
i
].
init
(
*
gpu
);
}
for
(
int
i
=
0
;
i
<
kernel_calls
;
i
++
)
{
kernel_data
[
i
].
alloc
(
1
,
*
gpu
);
timers_kernel
[
i
].
init
(
*
gpu
);
}
gpu_overhead
=
0.0
;
gpu_driver_overhead
=
0.0
;
for
(
int
i
=
0
;
i
<
10
;
i
++
)
{
gpu
->
sync
();
gpu_barrier
();
over_timer
.
start
();
gpu
->
sync
();
gpu_barrier
();
double
driver_time
=
MPI_Wtime
();
for
(
int
i
=
0
;
i
<
_data_in_estimate
;
i
++
)
{
timers_in
[
i
].
start
();
ucl_copy
(
dev_data_in
[
i
],
host_data_in
[
i
],
true
);
timers_in
[
i
].
stop
();
}
for
(
int
i
=
0
;
i
<
kernel_calls
;
i
++
)
{
timers_kernel
[
i
].
start
();
zero
(
kernel_data
[
i
],
1
);
timers_kernel
[
i
].
stop
();
}
for
(
int
i
=
0
;
i
<
_data_out_estimate
;
i
++
)
{
timers_out
[
i
].
start
();
ucl_copy
(
host_data_out
[
i
],
dev_data_out
[
i
],
true
);
timers_out
[
i
].
stop
();
}
over_timer
.
stop
();
double
time
=
over_timer
.
seconds
();
driver_time
=
MPI_Wtime
()
-
driver_time
;
if
(
time_device
())
{
for
(
int
i
=
0
;
i
<
_data_in_estimate
;
i
++
)
timers_in
[
i
].
add_to_total
();
for
(
int
i
=
0
;
i
<
kernel_calls
;
i
++
)
timers_kernel
[
i
].
add_to_total
();
for
(
int
i
=
0
;
i
<
_data_out_estimate
;
i
++
)
timers_out
[
i
].
add_to_total
();
}
double
mpi_time
,
mpi_driver_time
;
MPI_Allreduce
(
&
time
,
&
mpi_time
,
1
,
MPI_DOUBLE
,
MPI_MAX
,
gpu_comm
());
MPI_Allreduce
(
&
driver_time
,
&
mpi_driver_time
,
1
,
MPI_DOUBLE
,
MPI_MAX
,
gpu_comm
());
gpu_overhead
+=
mpi_time
;
gpu_driver_overhead
+=
mpi_driver_time
;
}
gpu_overhead
/=
10.0
;
gpu_driver_overhead
/=
10.0
;
if
(
_data_in_estimate
>
0
)
{
delete
[]
host_data_in
;
delete
[]
dev_data_in
;
delete
[]
timers_in
;
}
if
(
_data_out_estimate
>
0
)
{
delete
[]
host_data_out
;
delete
[]
dev_data_out
;
delete
[]
timers_out
;
}
if
(
kernel_calls
>
0
)
{
delete
[]
kernel_data
;
delete
[]
timers_kernel
;
}
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
output_times
(
UCL_Timer
&
time_pair
,
PairGPUAns
<
numtyp
,
acctyp
>
&
ans
,
PairGPUNbor
&
nbor
,
const
double
avg_split
,
const
double
max_bytes
,
const
double
gpu_overhead
,
const
double
driver_overhead
,
const
int
threads_per_atom
,
FILE
*
screen
)
{
double
single
[
8
],
times
[
8
];
single
[
0
]
=
atom
.
transfer_time
()
+
ans
.
transfer_time
();
single
[
1
]
=
nbor
.
time_nbor
.
total_seconds
();
single
[
2
]
=
nbor
.
time_kernel
.
total_seconds
();
single
[
3
]
=
time_pair
.
total_seconds
();
single
[
4
]
=
atom
.
cast_time
()
+
ans
.
cast_time
();
single
[
5
]
=
gpu_overhead
;
single
[
6
]
=
driver_overhead
;
single
[
7
]
=
ans
.
cpu_idle_time
();
MPI_Reduce
(
single
,
times
,
8
,
MPI_DOUBLE
,
MPI_SUM
,
0
,
_comm_replica
);
double
my_max_bytes
=
max_bytes
+
atom
.
max_gpu_bytes
();
double
mpi_max_bytes
;
MPI_Reduce
(
&
my_max_bytes
,
&
mpi_max_bytes
,
1
,
MPI_DOUBLE
,
MPI_MAX
,
0
,
_comm_replica
);
double
max_mb
=
mpi_max_bytes
/
(
1024.0
*
1024.0
);
if
(
replica_me
()
==
0
)
if
(
screen
&&
times
[
5
]
>
0.0
)
{
fprintf
(
screen
,
"
\n\n
-------------------------------------"
);
fprintf
(
screen
,
"--------------------------------
\n
"
);
fprintf
(
screen
,
" GPU Time Info (average): "
);
fprintf
(
screen
,
"
\n
-------------------------------------"
);
fprintf
(
screen
,
"--------------------------------
\n
"
);
if
(
time_device
())
{
fprintf
(
screen
,
"Data Transfer: %.4f s.
\n
"
,
times
[
0
]
/
_replica_size
);
fprintf
(
screen
,
"Data Cast/Pack: %.4f s.
\n
"
,
times
[
4
]
/
_replica_size
);
fprintf
(
screen
,
"Neighbor copy: %.4f s.
\n
"
,
times
[
1
]
/
_replica_size
);
if
(
nbor
.
gpu_nbor
())
fprintf
(
screen
,
"Neighbor build: %.4f s.
\n
"
,
times
[
2
]
/
_replica_size
);
else
fprintf
(
screen
,
"Neighbor unpack: %.4f s.
\n
"
,
times
[
2
]
/
_replica_size
);
fprintf
(
screen
,
"Force calc: %.4f s.
\n
"
,
times
[
3
]
/
_replica_size
);
}
fprintf
(
screen
,
"GPU Overhead: %.4f s.
\n
"
,
times
[
5
]
/
_replica_size
);
fprintf
(
screen
,
"Average split: %.4f.
\n
"
,
avg_split
);
fprintf
(
screen
,
"Threads / atom: %d.
\n
"
,
threads_per_atom
);
fprintf
(
screen
,
"Max Mem / Proc: %.2f MB.
\n
"
,
max_mb
);
fprintf
(
screen
,
"CPU Driver_Time: %.4f s.
\n
"
,
times
[
6
]
/
_replica_size
);
fprintf
(
screen
,
"CPU Idle_Time: %.4f s.
\n
"
,
times
[
7
]
/
_replica_size
);
fprintf
(
screen
,
"-------------------------------------"
);
fprintf
(
screen
,
"--------------------------------
\n\n
"
);
}
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
output_kspace_times
(
UCL_Timer
&
time_in
,
UCL_Timer
&
time_out
,
UCL_Timer
&
time_map
,
UCL_Timer
&
time_rho
,
UCL_Timer
&
time_interp
,
PairGPUAns
<
numtyp
,
acctyp
>
&
ans
,
const
double
max_bytes
,
const
double
cpu_time
,
const
double
idle_time
,
FILE
*
screen
)
{
double
single
[
8
],
times
[
8
];
single
[
0
]
=
time_out
.
total_seconds
();
single
[
1
]
=
time_in
.
total_seconds
()
+
atom
.
transfer_time
()
+
atom
.
cast_time
();
single
[
2
]
=
time_map
.
total_seconds
();
single
[
3
]
=
time_rho
.
total_seconds
();
single
[
4
]
=
time_interp
.
total_seconds
();
single
[
5
]
=
ans
.
transfer_time
()
+
ans
.
cast_time
();
single
[
6
]
=
cpu_time
;
single
[
7
]
=
idle_time
;
MPI_Reduce
(
single
,
times
,
8
,
MPI_DOUBLE
,
MPI_SUM
,
0
,
_comm_replica
);
double
my_max_bytes
=
max_bytes
+
atom
.
max_gpu_bytes
();
double
mpi_max_bytes
;
MPI_Reduce
(
&
my_max_bytes
,
&
mpi_max_bytes
,
1
,
MPI_DOUBLE
,
MPI_MAX
,
0
,
_comm_replica
);
double
max_mb
=
mpi_max_bytes
/
(
1024.0
*
1024.0
);
if
(
replica_me
()
==
0
)
if
(
screen
&&
times
[
6
]
>
0.0
)
{
fprintf
(
screen
,
"
\n\n
-------------------------------------"
);
fprintf
(
screen
,
"--------------------------------
\n
"
);
fprintf
(
screen
,
" GPU Time Info (average): "
);
fprintf
(
screen
,
"
\n
-------------------------------------"
);
fprintf
(
screen
,
"--------------------------------
\n
"
);
if
(
time_device
())
{
fprintf
(
screen
,
"Data Out: %.4f s.
\n
"
,
times
[
0
]
/
_replica_size
);
fprintf
(
screen
,
"Data In: %.4f s.
\n
"
,
times
[
1
]
/
_replica_size
);
fprintf
(
screen
,
"Kernel (map): %.4f s.
\n
"
,
times
[
2
]
/
_replica_size
);
fprintf
(
screen
,
"Kernel (rho): %.4f s.
\n
"
,
times
[
3
]
/
_replica_size
);
fprintf
(
screen
,
"Force interp: %.4f s.
\n
"
,
times
[
4
]
/
_replica_size
);
fprintf
(
screen
,
"Total rho: %.4f s.
\n
"
,
(
times
[
0
]
+
times
[
2
]
+
times
[
3
])
/
_replica_size
);
fprintf
(
screen
,
"Total interp: %.4f s.
\n
"
,
(
times
[
1
]
+
times
[
4
])
/
_replica_size
);
fprintf
(
screen
,
"Force copy/cast: %.4f s.
\n
"
,
times
[
5
]
/
_replica_size
);
fprintf
(
screen
,
"Total: %.4f s.
\n
"
,
(
times
[
0
]
+
times
[
1
]
+
times
[
2
]
+
times
[
3
]
+
times
[
4
]
+
times
[
5
])
/
_replica_size
);
}
fprintf
(
screen
,
"CPU Poisson: %.4f s.
\n
"
,
times
[
6
]
/
_replica_size
);
fprintf
(
screen
,
"CPU Idle Time: %.4f s.
\n
"
,
times
[
7
]
/
_replica_size
);
fprintf
(
screen
,
"Max Mem / Proc: %.2f MB.
\n
"
,
max_mb
);
fprintf
(
screen
,
"-------------------------------------"
);
fprintf
(
screen
,
"--------------------------------
\n\n
"
);
}
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
clear
()
{
if
(
_init_count
>
0
)
{
_long_range_precompute
=
0
;
_init_count
--
;
if
(
_init_count
==
0
)
{
atom
.
clear
();
_nbor_shared
.
clear
();
if
(
_compiled
)
{
k_zero
.
clear
();
k_info
.
clear
();
delete
dev_program
;
_compiled
=
false
;
}
}
}
}
template
<
class
numtyp
,
class
acctyp
>
void
PairGPUDeviceT
::
clear_device
()
{
while
(
_init_count
>
0
)
clear
();
if
(
_device_init
)
{
delete
gpu
;
_device_init
=
false
;
}
}
template
<
class
numtyp
,
class
acctyp
>
int
PairGPUDeviceT
::
compile_kernels
()
{
int
flag
=
0
;
if
(
_compiled
)
return
flag
;
std
::
string
flags
=
"-cl-mad-enable"
;
dev_program
=
new
UCL_Program
(
*
gpu
);
int
success
=
dev_program
->
load_string
(
pair_gpu_dev_kernel
,
flags
.
c_str
());
if
(
success
!=
UCL_SUCCESS
)
return
-
4
;
k_zero
.
set_function
(
*
dev_program
,
"kernel_zero"
);
k_info
.
set_function
(
*
dev_program
,
"kernel_info"
);
_compiled
=
true
;
UCL_H_Vec
<
int
>
h_gpu_lib_data
(
14
,
*
gpu
,
UCL_NOT_PINNED
);
UCL_D_Vec
<
int
>
d_gpu_lib_data
(
14
,
*
gpu
);
k_info
.
set_size
(
1
,
1
);
k_info
.
run
(
&
d_gpu_lib_data
.
begin
());
ucl_copy
(
h_gpu_lib_data
,
d_gpu_lib_data
,
false
);
_ptx_arch
=
static_cast
<
double
>
(
h_gpu_lib_data
[
0
])
/
100.0
;
#ifndef USE_OPENCL
if
(
_ptx_arch
>
gpu
->
arch
())
return
-
4
;
#endif
_num_mem_threads
=
h_gpu_lib_data
[
1
];
_warp_size
=
h_gpu_lib_data
[
2
];
if
(
_threads_per_atom
<
1
)
_threads_per_atom
=
h_gpu_lib_data
[
3
];
if
(
_threads_per_charge
<
1
)
_threads_per_charge
=
h_gpu_lib_data
[
13
];
_pppm_max_spline
=
h_gpu_lib_data
[
4
];
_pppm_block
=
h_gpu_lib_data
[
5
];
_block_pair
=
h_gpu_lib_data
[
6
];
_max_shared_types
=
h_gpu_lib_data
[
7
];
_block_cell_2d
=
h_gpu_lib_data
[
8
];
_block_cell_id
=
h_gpu_lib_data
[
9
];
_block_nbor_build
=
h_gpu_lib_data
[
10
];
_block_bio_pair
=
h_gpu_lib_data
[
11
];
_max_bio_shared_types
=
h_gpu_lib_data
[
12
];
if
(
static_cast
<
size_t
>
(
_block_pair
)
>
gpu
->
group_size
())
_block_pair
=
gpu
->
group_size
();
if
(
static_cast
<
size_t
>
(
_block_bio_pair
)
>
gpu
->
group_size
())
_block_bio_pair
=
gpu
->
group_size
();
if
(
_threads_per_atom
>
_warp_size
)
_threads_per_atom
=
_warp_size
;
if
(
_warp_size
%
_threads_per_atom
!=
0
)
_threads_per_atom
=
1
;
if
(
_threads_per_charge
>
_warp_size
)
_threads_per_charge
=
_warp_size
;
if
(
_warp_size
%
_threads_per_charge
!=
0
)
_threads_per_charge
=
1
;
return
flag
;
}
template
<
class
numtyp
,
class
acctyp
>
double
PairGPUDeviceT
::
host_memory_usage
()
const
{
return
atom
.
host_memory_usage
()
+
4
*
sizeof
(
numtyp
)
+
sizeof
(
PairGPUDevice
<
numtyp
,
acctyp
>
);
}
template
class
PairGPUDevice
<
PRECISION
,
ACC_PRECISION
>
;
PairGPUDevice
<
PRECISION
,
ACC_PRECISION
>
pair_gpu_device
;
int
lmp_init_device
(
MPI_Comm
world
,
MPI_Comm
replica
,
const
int
first_gpu
,
const
int
last_gpu
,
const
int
gpu_mode
,
const
double
particle_split
,
const
int
nthreads
,
const
int
t_per_atom
)
{
return
pair_gpu_device
.
init_device
(
world
,
replica
,
first_gpu
,
last_gpu
,
gpu_mode
,
particle_split
,
nthreads
,
t_per_atom
);
}
void
lmp_clear_device
()
{
pair_gpu_device
.
clear_device
();
}
double
lmp_gpu_forces
(
double
**
f
,
double
**
tor
,
double
*
eatom
,
double
**
vatom
,
double
*
virial
,
double
&
ecoul
)
{
return
pair_gpu_device
.
fix_gpu
(
f
,
tor
,
eatom
,
vatom
,
virial
,
ecoul
);
}
Event Timeline
Log In to Comment