Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86519455
gb_gpu_memory.h
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Oct 6, 23:52
Size
7 KB
Mime Type
text/x-c++
Expires
Tue, Oct 8, 23:52 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
21278996
Attached To
rLAMMPS lammps
gb_gpu_memory.h
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
template
<
class
numtyp
,
class
acctyp
>
class
GB_GPU_Memory
{
public:
GB_GPU_Memory
();
~
GB_GPU_Memory
();
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return false if there is not sufficient memory or device init prob
*
* Returns:
* - 0 if successfull
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int
init
(
const
int
ntypes
,
const
double
gamma
,
const
double
upsilon
,
const
double
mu
,
double
**
host_shape
,
double
**
host_well
,
double
**
host_cutsq
,
double
**
host_sigma
,
double
**
host_epsilon
,
double
*
host_lshape
,
int
**
h_form
,
double
**
host_lj1
,
double
**
host_lj2
,
double
**
host_lj3
,
double
**
host_lj4
,
double
**
host_offset
,
const
double
*
host_special_lj
,
const
int
nlocal
,
const
int
nall
,
const
int
max_nbors
,
const
double
cell_size
,
const
double
gpu_split
,
FILE
*
screen
);
/// Estimate the overhead for GPU context changes and CPU driver
void
estimate_gpu_overhead
();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline
void
resize_atom
(
const
int
inum
,
const
int
nall
,
bool
&
success
)
{
atom
->
resize
(
nall
,
success
);
ans
->
resize
(
inum
,
success
);
if
(
multiple_forms
)
ans
->
dev_ans
.
zero
();
double
bytes
=
ans
->
gpu_bytes
()
+
nbor
->
gpu_bytes
();
if
(
bytes
>
_max_bytes
)
_max_bytes
=
bytes
;
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline
void
resize_local
(
const
int
nlocal
,
const
int
host_inum
,
const
int
max_nbors
,
const
int
olist_size
,
bool
&
success
)
{
if
(
olist_size
>
static_cast
<
int
>
(
host_olist
.
numel
()))
{
host_olist
.
clear
();
int
new_size
=
static_cast
<
int
>
(
static_cast
<
double
>
(
olist_size
)
*
1.10
);
success
=
success
&&
(
host_olist
.
alloc
(
new_size
,
*
ucl_device
)
==
UCL_SUCCESS
);
}
nbor
->
resize
(
nlocal
,
host_inum
,
max_nbors
,
success
);
double
bytes
=
ans
->
gpu_bytes
()
+
nbor
->
gpu_bytes
();
if
(
bytes
>
_max_bytes
)
_max_bytes
=
bytes
;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void
clear
();
/// Returns memory usage on device per atom
int
bytes_per_atom
(
const
int
max_nbors
)
const
;
/// Total host memory used by library for pair style
double
host_memory_usage
()
const
;
/// Accumulate timers
inline
void
acc_timers
()
{
if
(
device
->
time_device
())
{
if
(
nbor_time_avail
)
{
nbor
->
time_nbor
.
add_to_total
();
nbor
->
time_kernel
.
add_to_total
();
nbor_time_avail
=
false
;
}
time_kernel
.
add_to_total
();
time_gayberne
.
add_to_total
();
if
(
multiple_forms
)
{
time_kernel2
.
add_to_total
();
time_gayberne2
.
add_to_total
();
time_pair
.
add_to_total
();
}
atom
->
acc_timers
();
ans
->
acc_timers
();
}
}
/// Accumulate timers
inline
void
zero_timers
()
{
nbor_time_avail
=
false
;
time_kernel
.
zero
();
time_gayberne
.
zero
();
if
(
multiple_forms
)
{
time_kernel2
.
zero
();
time_gayberne2
.
zero
();
time_pair
.
zero
();
}
atom
->
zero_timers
();
ans
->
zero_timers
();
}
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice
<
numtyp
,
acctyp
>
*
device
;
/// Geryon device
UCL_Device
*
ucl_device
;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec
<
int
>
dev_error
;
/// Device timers
UCL_Timer
time_kernel
,
time_gayberne
,
time_kernel2
,
time_gayberne2
,
time_pair
;
/// Host device load balancer
PairGPUBalance
<
numtyp
,
acctyp
>
hd_balancer
;
/// LAMMPS pointer for screen output
FILE
*
screen
;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec
<
numtyp4
>
lj1
;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec
<
numtyp4
>
lj3
;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec
<
numtyp2
>
sigma_epsilon
;
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec
<
numtyp2
>
cut_form
;
// 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
UCL_D_Vec
<
numtyp
>
gamma_upsilon_mu
;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool
multiple_forms
;
int
**
host_form
;
/// If atom type constants fit in shared memory, use fast kernels
bool
shared_types
;
int
_lj_types
;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom
<
numtyp
,
acctyp
>
*
atom
;
/// Aspherical Const Data for Atoms
UCL_D_Vec
<
numtyp4
>
shape
,
well
;
/// Aspherical Const Data for Atoms
UCL_D_Vec
<
numtyp
>
lshape
;
int
last_ellipse
,
max_last_ellipse
;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns
<
numtyp
,
acctyp
>
*
ans
;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor
*
nbor
;
/// ilist with particles sorted by type
UCL_H_Vec
<
int
>
host_olist
;
/// True if we should accumulate the neighbor timer
bool
nbor_time_avail
;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program
*
pair_program
,
*
gb_program
,
*
gb_lj_program
;
UCL_Kernel
k_gb_nbor_fast
,
k_gb_nbor
;
UCL_Kernel
k_gayberne
,
k_sphere_gb
,
k_lj_fast
,
k_lj
;
inline
int
block_size
()
{
return
_block_size
;
}
int
_threads_per_atom
;
private:
bool
_allocated
,
_compiled
;
int
_block_size
;
double
_max_bytes
;
double
_gpu_overhead
,
_driver_overhead
;
void
compile_kernels
(
UCL_Device
&
dev
);
};
#endif
Event Timeline
Log In to Comment