Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F71106707
lal_atom.h
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Jul 9, 16:55
Size
12 KB
Mime Type
text/x-c++
Expires
Thu, Jul 11, 16:55 (2 d)
Engine
blob
Format
Raw Data
Handle
18878934
Attached To
rLAMMPS lammps
lal_atom.h
View Options
/***************************************************************************
atom.h
-------------------
W. Michael Brown (ORNL)
Class for particle data management
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <math.h>
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using
namespace
ucl_opencl
;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using
namespace
ucl_cudart
;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using
namespace
ucl_cudadr
;
#endif
#ifdef USE_CUDPP
#include "cudpp.h"
#endif
#include "lal_precision.h"
namespace
LAMMPS_AL
{
template
<
class
numtyp
,
class
acctyp
>
class
Atom
{
public:
Atom
();
~
Atom
()
{
clear
();
}
/// Maximum number of atoms that can be stored with current allocation
inline
int
max_atoms
()
const
{
return
_max_atoms
;
}
/// Current number of local+ghost atoms stored
inline
int
nall
()
const
{
return
_nall
;
}
/// Set number of local+ghost atoms for future copy operations
inline
void
nall
(
const
int
n
)
{
_nall
=
n
;
}
/// Memory usage per atom in this class
int
bytes_per_atom
()
const
;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool
init
(
const
int
nall
,
const
bool
charge
,
const
bool
rot
,
UCL_Device
&
dev
,
const
int
gpu_nbor
=
0
,
const
bool
bonds
=
false
);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline
bool
resize
(
const
int
nall
,
bool
&
success
)
{
_nall
=
nall
;
if
(
nall
>
_max_atoms
)
{
clear_resize
();
success
=
success
&&
alloc
(
nall
);
_resized
=
true
;
}
return
_resized
;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool
add_fields
(
const
bool
charge
,
const
bool
rot
,
const
int
gpu_nbor
,
const
bool
bonds
);
/// Returns true if GPU is using charges
bool
charge
()
{
return
_charge
;
}
/// Returns true if GPU is using quaternions
bool
quaternion
()
{
return
_rot
;
}
/// Only free matrices of length inum or nall for resizing
void
clear_resize
();
/// Free all memory on host and device
void
clear
();
/// Return the total amount of host memory used by class in bytes
double
host_memory_usage
()
const
;
/// Sort arrays for neighbor list calculation on device
void
sort_neighbor
(
const
int
num_atoms
);
/// Add copy times to timers
inline
void
acc_timers
()
{
time_pos
.
add_to_total
();
if
(
_charge
)
time_q
.
add_to_total
();
if
(
_rot
)
time_quat
.
add_to_total
();
}
/// Add copy times to timers
inline
void
zero_timers
()
{
time_pos
.
zero
();
if
(
_charge
)
time_q
.
zero
();
if
(
_rot
)
time_quat
.
zero
();
}
/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline
double
transfer_time
()
{
double
total
=
time_pos
.
total_seconds
();
time_pos
.
zero_total
();
if
(
_charge
)
{
total
+=
time_q
.
total_seconds
();
time_q
.
zero_total
();
}
if
(
_rot
)
{
total
+=
time_q
.
total_seconds
();
time_quat
.
zero_total
();
}
return
total
+
_time_transfer
/
1000.0
;
}
/// Return the total time for data cast/pack
/** Zeros the time so that atom times are only included once **/
inline
double
cast_time
()
{
double
t
=
_time_cast
;
_time_cast
=
0.0
;
return
t
;
}
/// Pack LAMMPS atom type constants into matrix and copy to device
template
<
class
dev_typ
,
class
t1
>
inline
void
type_pack1
(
const
int
n
,
const
int
m_size
,
UCL_D_Vec
<
dev_typ
>
&
dev_v
,
UCL_H_Vec
<
numtyp
>
&
buffer
,
t1
**
one
)
{
int
ii
=
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
buffer
[
ii
]
=
static_cast
<
numtyp
>
(
one
[
i
][
j
]);
ii
++
;
}
ii
+=
m_size
-
n
;
}
UCL_H_Vec
<
dev_typ
>
view
;
view
.
view
((
dev_typ
*
)
buffer
.
begin
(),
m_size
*
m_size
,
*
dev
);
ucl_copy
(
dev_v
,
view
,
false
);
}
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
template
<
class
dev_typ
,
class
t1
,
class
t2
>
inline
void
type_pack2
(
const
int
n
,
const
int
m_size
,
UCL_D_Vec
<
dev_typ
>
&
dev_v
,
UCL_H_Vec
<
numtyp
>
&
buffer
,
t1
**
one
,
t2
**
two
)
{
int
ii
=
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
buffer
[
ii
*
2
]
=
static_cast
<
numtyp
>
(
one
[
i
][
j
]);
buffer
[
ii
*
2
+
1
]
=
static_cast
<
numtyp
>
(
two
[
i
][
j
]);
ii
++
;
}
ii
+=
m_size
-
n
;
}
UCL_H_Vec
<
dev_typ
>
view
;
view
.
view
((
dev_typ
*
)
buffer
.
begin
(),
m_size
*
m_size
,
*
dev
);
ucl_copy
(
dev_v
,
view
,
false
);
}
/// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
template
<
class
dev_typ
,
class
t1
,
class
t2
,
class
t3
>
inline
void
type_pack4
(
const
int
n
,
const
int
m_size
,
UCL_D_Vec
<
dev_typ
>
&
dev_v
,
UCL_H_Vec
<
numtyp
>
&
buffer
,
t1
**
one
,
t2
**
two
,
t3
**
three
)
{
int
ii
=
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
buffer
[
ii
*
4
]
=
static_cast
<
numtyp
>
(
one
[
i
][
j
]);
buffer
[
ii
*
4
+
1
]
=
static_cast
<
numtyp
>
(
two
[
i
][
j
]);
buffer
[
ii
*
4
+
2
]
=
static_cast
<
numtyp
>
(
three
[
i
][
j
]);
ii
++
;
}
ii
+=
m_size
-
n
;
}
UCL_H_Vec
<
dev_typ
>
view
;
view
.
view
((
dev_typ
*
)
buffer
.
begin
(),
m_size
*
m_size
,
*
dev
);
ucl_copy
(
dev_v
,
view
,
false
);
}
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template
<
class
dev_typ
,
class
t1
,
class
t2
,
class
t3
,
class
t4
>
inline
void
type_pack4
(
const
int
n
,
const
int
m_size
,
UCL_D_Vec
<
dev_typ
>
&
dev_v
,
UCL_H_Vec
<
numtyp
>
&
buffer
,
t1
**
one
,
t2
**
two
,
t3
**
three
,
t4
**
four
)
{
int
ii
=
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
buffer
[
ii
*
4
]
=
static_cast
<
numtyp
>
(
one
[
i
][
j
]);
buffer
[
ii
*
4
+
1
]
=
static_cast
<
numtyp
>
(
two
[
i
][
j
]);
buffer
[
ii
*
4
+
2
]
=
static_cast
<
numtyp
>
(
three
[
i
][
j
]);
buffer
[
ii
*
4
+
3
]
=
static_cast
<
numtyp
>
(
four
[
i
][
j
]);
ii
++
;
}
ii
+=
m_size
-
n
;
}
UCL_H_Vec
<
dev_typ
>
view
;
view
.
view
((
dev_typ
*
)
buffer
.
begin
(),
m_size
*
m_size
,
*
dev
);
ucl_copy
(
dev_v
,
view
,
false
);
}
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
template
<
class
dev_typ
,
class
t1
,
class
t2
>
inline
void
self_pack2
(
const
int
n
,
UCL_D_Vec
<
dev_typ
>
&
dev_v
,
UCL_H_Vec
<
numtyp
>
&
buffer
,
t1
**
one
,
t2
**
two
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
buffer
[
i
*
2
]
=
static_cast
<
numtyp
>
(
one
[
i
][
i
]);
buffer
[
i
*
2
+
1
]
=
static_cast
<
numtyp
>
(
two
[
i
][
i
]);
}
UCL_H_Vec
<
dev_typ
>
view
;
view
.
view
((
dev_typ
*
)
buffer
.
begin
(),
n
,
*
dev
);
ucl_copy
(
dev_v
,
view
,
false
);
}
// -------------------------COPY TO GPU ----------------------------------
/// Signal that we need to transfer atom data for next timestep
inline
void
data_unavail
()
{
_x_avail
=
false
;
_q_avail
=
false
;
_quat_avail
=
false
;
_resized
=
false
;
}
/// Cast positions and types to write buffer
inline
void
cast_x_data
(
double
**
host_ptr
,
const
int
*
host_type
)
{
if
(
_x_avail
==
false
)
{
double
t
=
MPI_Wtime
();
#ifdef GPU_CAST
memcpy
(
host_x_cast
.
begin
(),
host_ptr
[
0
],
_nall
*
3
*
sizeof
(
double
));
memcpy
(
host_type_cast
.
begin
(),
host_type
,
_nall
*
sizeof
(
int
));
#else
int
wl
=
0
;
for
(
int
i
=
0
;
i
<
_nall
;
i
++
)
{
x
[
wl
]
=
host_ptr
[
i
][
0
];
x
[
wl
+
1
]
=
host_ptr
[
i
][
1
];
x
[
wl
+
2
]
=
host_ptr
[
i
][
2
];
x
[
wl
+
3
]
=
host_type
[
i
];
wl
+=
4
;
}
#endif
_time_cast
+=
MPI_Wtime
()
-
t
;
}
}
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline
void
add_x_data
(
double
**
host_ptr
,
int
*
host_type
)
{
time_pos
.
start
();
if
(
_x_avail
==
false
)
{
#ifdef GPU_CAST
x_cast
.
update_device
(
_nall
*
3
,
true
);
type_cast
.
update_device
(
_nall
,
true
);
int
block_size
=
64
;
int
GX
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
_nall
)
/
block_size
));
k_cast_x
.
set_size
(
GX
,
block_size
);
k_cast_x
.
run
(
&
x
,
&
x_cast
,
&
type_cast
,
&
_nall
);
#else
x
.
update_device
(
_nall
*
4
,
true
);
#endif
_x_avail
=
true
;
}
time_pos
.
stop
();
}
/// Calls cast_x_data and add_x_data and times the routines
inline
void
cast_copy_x
(
double
**
host_ptr
,
int
*
host_type
)
{
cast_x_data
(
host_ptr
,
host_type
);
add_x_data
(
host_ptr
,
host_type
);
}
// Cast charges to write buffer
template
<
class
cpytyp
>
inline
void
cast_q_data
(
cpytyp
*
host_ptr
)
{
if
(
_q_avail
==
false
)
{
double
t
=
MPI_Wtime
();
// If double precision, still memcpy for async transfers
if
(
_host_view
)
{
q
.
host
.
view
((
numtyp
*
)
host_ptr
,
_nall
,
*
dev
);
q
.
device
.
view
(
q
.
host
);
}
else
if
(
sizeof
(
numtyp
)
==
sizeof
(
double
))
memcpy
(
q
.
host
.
begin
(),
host_ptr
,
_nall
*
sizeof
(
numtyp
));
else
for
(
int
i
=
0
;
i
<
_nall
;
i
++
)
q
[
i
]
=
host_ptr
[
i
];
_time_cast
+=
MPI_Wtime
()
-
t
;
}
}
// Copy charges to device asynchronously
inline
void
add_q_data
()
{
if
(
_q_avail
==
false
)
{
q
.
update_device
(
_nall
,
true
);
_q_avail
=
true
;
}
}
// Cast quaternions to write buffer
template
<
class
cpytyp
>
inline
void
cast_quat_data
(
cpytyp
*
host_ptr
)
{
if
(
_quat_avail
==
false
)
{
double
t
=
MPI_Wtime
();
if
(
_host_view
)
{
quat
.
host
.
view
((
numtyp
*
)
host_ptr
,
_nall
*
4
,
*
dev
);
quat
.
device
.
view
(
quat
.
host
);
}
else
if
(
sizeof
(
numtyp
)
==
sizeof
(
double
))
memcpy
(
quat
.
host
.
begin
(),
host_ptr
,
_nall
*
4
*
sizeof
(
numtyp
));
else
for
(
int
i
=
0
;
i
<
_nall
*
4
;
i
++
)
quat
[
i
]
=
host_ptr
[
i
];
_time_cast
+=
MPI_Wtime
()
-
t
;
}
}
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline
void
add_quat_data
()
{
if
(
_quat_avail
==
false
)
{
quat
.
update_device
(
_nall
*
4
,
true
);
_quat_avail
=
true
;
}
}
/// Add in casting time from additional data (seconds)
inline
void
add_cast_time
(
double
t
)
{
_time_cast
+=
t
;
}
/// Add in transfer time from additional data (ms)
inline
void
add_transfer_time
(
double
t
)
{
_time_transfer
+=
t
;
}
/// Return number of bytes used on device
inline
double
max_gpu_bytes
()
{
double
m
=
_max_gpu_bytes
;
_max_gpu_bytes
=
0.0
;
return
m
;
}
/// Returns true if the device is addressing memory on the host
inline
bool
host_view
()
{
return
_host_view
;
}
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type
UCL_Vector
<
numtyp
,
numtyp
>
x
;
/// Charges
UCL_Vector
<
numtyp
,
numtyp
>
q
;
/// Quaterions
UCL_Vector
<
numtyp
,
numtyp
>
quat
;
#ifdef GPU_CAST
UCL_Vector
<
double
,
double
>
x_cast
;
UCL_Vector
<
int
,
int
>
type_cast
;
#endif
/// Cell list identifiers for device nbor builds
UCL_D_Vec
<
unsigned
>
dev_cell_id
;
/// Cell list identifiers for device nbor builds
UCL_D_Vec
<
int
>
dev_particle_id
;
/// Atom tag information for device nbor builds
UCL_D_Vec
<
int
>
dev_tag
;
/// Cell list identifiers for hybrid nbor builds
UCL_H_Vec
<
int
>
host_cell_id
;
/// Cell list identifiers for hybrid nbor builds
UCL_H_Vec
<
int
>
host_particle_id
;
/// Device timers
UCL_Timer
time_pos
,
time_q
,
time_quat
;
/// Geryon device
UCL_Device
*
dev
;
private:
#ifdef GPU_CAST
UCL_Program
*
atom_program
;
UCL_Kernel
k_cast_x
;
void
compile_kernels
(
UCL_Device
&
dev
);
#endif
bool
_compiled
;
// True if data has been copied to device already
bool
_x_avail
,
_q_avail
,
_quat_avail
,
_resized
;
bool
alloc
(
const
int
nall
);
bool
_allocated
,
_rot
,
_charge
,
_bonds
,
_other
;
int
_max_atoms
,
_nall
,
_gpu_nbor
;
bool
_host_view
;
double
_time_cast
,
_time_transfer
;
double
_max_gpu_bytes
;
#ifdef USE_CUDPP
CUDPPConfiguration
sort_config
;
CUDPPHandle
sort_plan
;
#endif
};
}
#endif
Event Timeline
Log In to Comment