Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86525166
pair_gpu_atom.h
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Oct 7, 00:48
Size
12 KB
Mime Type
text/x-c++
Expires
Wed, Oct 9, 00:48 (2 d)
Engine
blob
Format
Raw Data
Handle
21318055
Attached To
rLAMMPS lammps
pair_gpu_atom.h
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using
namespace
ucl_opencl
;
#else
#include "cudpp.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using
namespace
ucl_cudadr
;
#endif
#include "pair_gpu_precision.h"
/// Per-atom data storage and host-to-device transfer for GPU pair styles.
/** Holds the host staging buffers and device vectors for positions/types,
  * charges and quaternions, together with the timers that measure the
  * transfers and the accumulated host-side cast/pack time.
  *
  * \tparam numtyp Element type of the position, charge and quaternion vectors
  * \tparam acctyp Second precision parameter; it is not referenced by any of
  *                the inline code in this header **/
template <class numtyp, class acctyp>
class PairGPUAtom {
 public:
  PairGPUAtom();

  /// Frees all storage by calling clear()
  ~PairGPUAtom() { clear(); }

  /// Maximum number of atoms that can be stored with current allocation
  inline int max_atoms() const { return _max_atoms; }

  /// Current number of local+ghost atoms stored
  inline int nall() const { return _nall; }

  /// Set number of local+ghost atoms for future copy operations
  inline void nall(const int n) { _nall = n; }

  /// Memory usage per atom in this class
  int bytes_per_atom() const;

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor True if neighboring will be performed on device
    *
    * NOTE(review): nall, charge, dev and bonds are not described by the
    * original documentation; see the out-of-line definition for their
    * exact meaning. **/
  bool init(const int nall, const bool charge, const bool rot,
            UCL_Device &dev, const bool gpu_nbor = false,
            const bool bonds = false);

  /// Check if we have enough device storage and realloc if not
  /** Always records \a nall as the current atom count. When \a nall exceeds
    * the current capacity, the per-atom storage is released with
    * clear_resize() and reallocated with alloc().
    *
    * \param nall    New number of local+ghost atoms
    * \param success In/out flag; AND-ed with the result of alloc(). Because
    *                of short-circuit evaluation alloc() is NOT called when
    *                \a success is already false on entry.
    * \return True if resized with any call during this timestep (the flag
    *         is reset by data_unavail()) **/
  inline bool resize(const int nall, bool &success) {
    _nall = nall;
    if (nall > _max_atoms) {
      clear_resize();
      success = success && alloc(nall);
      _resized = true;
    }
    return _resized;
  }

  /// If already initialized by another LAMMPS style, add fields as necessary
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor True if neighboring will be performed on device **/
  bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
                  const bool bonds);

  /// Returns true if GPU is using charges
  bool charge() const { return _charge; }

  /// Returns true if GPU is using quaternions
  bool quat() const { return _rot; }

  /// Only free matrices of length inum or nall for resizing
  void clear_resize();

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class in bytes
  double host_memory_usage() const;

  /// Sort arrays for neighbor list calculation on device
  void sort_neighbor(const int num_atoms);

  /// Add copy times to timers
  /** The charge and quaternion timers are only touched when the
    * corresponding field is in use. **/
  inline void acc_timers() {
    time_pos.add_to_total();
    if (_charge)
      time_q.add_to_total();
    if (_rot)
      time_quat.add_to_total();
  }

  /// Zero the copy timers
  /** The charge and quaternion timers are only touched when the
    * corresponding field is in use. **/
  inline void zero_timers() {
    time_pos.zero();
    if (_charge)
      time_q.zero();
    if (_rot)
      time_quat.zero();
  }

  /// Return the total time for host/device data transfer
  /** Zeros the total so that the atom times are only included once **/
  inline double transfer_time() {
    double total = time_pos.total_seconds();
    time_pos.zero_total();
    if (_charge) {
      total += time_q.total_seconds();
      time_q.zero_total();
    }
    if (_rot) {
      // Read the quaternion timer here; reading time_q (as the code once
      // did) dropped the quaternion transfer time from the total.
      total += time_quat.total_seconds();
      time_quat.zero_total();
    }
    return total;
  }

  /// Return the total time for data cast/pack
  /** Zeros the time so that atom times are only included once **/
  inline double cast_time() {
    const double t = _time_cast;
    _time_cast = 0.0;
    return t;
  }

  /// Pack LAMMPS atom type constants into matrix and copy to device
  /** Entry (i,j) of \a one is written to buffer[i*m_size+j] for
    * 0 <= i,j < n; the remaining entries of each row are left untouched.
    * The buffer is then viewed as m_size*m_size elements of \a dev_typ and
    * copied to \a dev_v.
    *
    * \param n      Number of rows/columns read from \a one
    * \param m_size Row stride (and row count) of the packed matrix
    * \param dev_v  Destination vector
    * \param buffer Host staging buffer
    * \param one    Source table, indexed one[i][j] **/
  template <class dev_typ, class t1>
  inline void type_pack1(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one) {
    int ii = 0;
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        buffer[ii] = static_cast<numtyp>(one[i][j]);
        ii++;
      }
      // Skip the unused tail of the row
      ii += m_size - n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ *)buffer.begin(), m_size * m_size, *dev);
    ucl_copy(dev_v, view, false);
  }

  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
  /** Entries (i,j) of \a one and \a two are interleaved at
    * buffer[2*(i*m_size+j)] and buffer[2*(i*m_size+j)+1]. The buffer is then
    * viewed as m_size*m_size elements of \a dev_typ and copied to \a dev_v.
    *
    * \param n      Number of rows/columns read from the source tables
    * \param m_size Row stride (and row count) of the packed matrix
    * \param dev_v  Destination vector
    * \param buffer Host staging buffer
    * \param one    Source for the first component
    * \param two    Source for the second component **/
  template <class dev_typ, class t1, class t2>
  inline void type_pack2(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two) {
    int ii = 0;
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        buffer[ii * 2] = static_cast<numtyp>(one[i][j]);
        buffer[ii * 2 + 1] = static_cast<numtyp>(two[i][j]);
        ii++;
      }
      // Skip the unused tail of the row
      ii += m_size - n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ *)buffer.begin(), m_size * m_size, *dev);
    ucl_copy(dev_v, view, false);
  }

  /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
  /** Entries (i,j) of \a one, \a two and \a three are written to slots
    * 0, 1 and 2 of the group of four starting at buffer[4*(i*m_size+j)].
    * Slot 3 of each group is not written by this function. The buffer is
    * then viewed as m_size*m_size elements of \a dev_typ and copied to
    * \a dev_v.
    *
    * \param n      Number of rows/columns read from the source tables
    * \param m_size Row stride (and row count) of the packed matrix
    * \param dev_v  Destination vector
    * \param buffer Host staging buffer
    * \param one    Source for component 0
    * \param two    Source for component 1
    * \param three  Source for component 2 **/
  template <class dev_typ, class t1, class t2, class t3>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three) {
    int ii = 0;
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        buffer[ii * 4] = static_cast<numtyp>(one[i][j]);
        buffer[ii * 4 + 1] = static_cast<numtyp>(two[i][j]);
        buffer[ii * 4 + 2] = static_cast<numtyp>(three[i][j]);
        ii++;
      }
      // Skip the unused tail of the row
      ii += m_size - n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ *)buffer.begin(), m_size * m_size, *dev);
    ucl_copy(dev_v, view, false);
  }

  /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
  /** Entries (i,j) of \a one, \a two, \a three and \a four are written to
    * slots 0-3 of the group of four starting at buffer[4*(i*m_size+j)].
    * The buffer is then viewed as m_size*m_size elements of \a dev_typ and
    * copied to \a dev_v.
    *
    * \param n      Number of rows/columns read from the source tables
    * \param m_size Row stride (and row count) of the packed matrix
    * \param dev_v  Destination vector
    * \param buffer Host staging buffer
    * \param one    Source for component 0
    * \param two    Source for component 1
    * \param three  Source for component 2
    * \param four   Source for component 3 **/
  template <class dev_typ, class t1, class t2, class t3, class t4>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three, t4 **four) {
    int ii = 0;
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        buffer[ii * 4] = static_cast<numtyp>(one[i][j]);
        buffer[ii * 4 + 1] = static_cast<numtyp>(two[i][j]);
        buffer[ii * 4 + 2] = static_cast<numtyp>(three[i][j]);
        buffer[ii * 4 + 3] = static_cast<numtyp>(four[i][j]);
        ii++;
      }
      // Skip the unused tail of the row
      ii += m_size - n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ *)buffer.begin(), m_size * m_size, *dev);
    ucl_copy(dev_v, view, false);
  }

  /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
  /** Only the diagonal entries one[i][i] and two[i][i] are read; they are
    * interleaved at buffer[2*i] and buffer[2*i+1]. The buffer is then viewed
    * as \a n elements of \a dev_typ and copied to \a dev_v.
    *
    * \param n      Number of diagonal entries to pack
    * \param dev_v  Destination vector
    * \param buffer Host staging buffer
    * \param one    Source for the first component
    * \param two    Source for the second component **/
  template <class dev_typ, class t1, class t2>
  inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                         UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
    for (int i = 0; i < n; i++) {
      buffer[i * 2] = static_cast<numtyp>(one[i][i]);
      buffer[i * 2 + 1] = static_cast<numtyp>(two[i][i]);
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ *)buffer.begin(), n, *dev);
    ucl_copy(dev_v, view, false);
  }

  // -------------------------COPY TO GPU ----------------------------------

  /// Signal that we need to transfer atom data for next timestep
  /** Also clears the "resized" flag reported by resize() **/
  inline void data_unavail() {
    _x_avail = false;
    _q_avail = false;
    _quat_avail = false;
    _resized = false;
  }

  /// Cast positions and types to write buffer
  /** Does nothing if the position data has already been marked available.
    * The elapsed wall-clock time is added to the cast time.
    *
    * With GPU_CAST defined, nall()*3 doubles starting at host_ptr[0] and
    * nall() ints from \a host_type are copied raw into the cast buffers
    * (this reads host_ptr[0] as one contiguous array). Otherwise each atom
    * is written to host_x as four values: x, y, z, type.
    *
    * \param host_ptr  Per-atom coordinates, indexed host_ptr[i][0..2]
    * \param host_type Per-atom types **/
  inline void cast_x_data(double **host_ptr, const int *host_type) {
    if (_x_avail == false) {
      double t = MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(host_x_cast.begin(), host_ptr[0], _nall * 3 * sizeof(double));
      memcpy(host_type_cast.begin(), host_type, _nall * sizeof(int));
      #else
      numtyp *write_loc = host_x.begin();
      for (int i = 0; i < _nall; i++) {
        *write_loc = host_ptr[i][0];
        write_loc++;
        *write_loc = host_ptr[i][1];
        write_loc++;
        *write_loc = host_ptr[i][2];
        write_loc++;
        // The type is stored in the 4th slot, converted to numtyp
        *write_loc = host_type[i];
        write_loc++;
      }
      #endif
      _time_cast += MPI_Wtime() - t;
    }
  }

  /// Copy positions and types to device asynchronously
  /** Copies nall() elements. The work is bracketed by the time_pos timer
    * and is skipped if the position data is already marked available.
    * With GPU_CAST defined, the raw double/int buffers are copied and the
    * k_cast_x kernel is launched (block size 64) to fill dev_x; otherwise
    * nall()*4 values of host_x are copied into dev_x.
    *
    * The arguments are not read here; the data comes from the buffers
    * filled by cast_x_data(). **/
  inline void add_x_data(double **host_ptr, int *host_type) {
    time_pos.start();
    if (_x_avail == false) {
      #ifdef GPU_CAST
      ucl_copy(dev_x_cast, host_x_cast, _nall * 3, true);
      ucl_copy(dev_type_cast, host_type_cast, _nall, true);
      int block_size = 64;
      // Number of blocks needed to cover all atoms
      int GX = static_cast<int>(ceil(static_cast<double>(_nall) / block_size));
      k_cast_x.set_size(GX, block_size);
      k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
                   &_nall);
      #else
      ucl_copy(dev_x, host_x, _nall * 4, true);
      #endif
      _x_avail = true;
    }
    time_pos.stop();
  }

  /// Calls cast_x_data and add_x_data and times the routines
  inline void cast_copy_x(double **host_ptr, int *host_type) {
    cast_x_data(host_ptr, host_type);
    add_x_data(host_ptr, host_type);
  }

  /// Cast charges to write buffer
  /** Does nothing if the charge data has already been marked available.
    * The elapsed wall-clock time is added to the cast time.
    *
    * When sizeof(numtyp) == sizeof(double):
    *  - on a UCL_CPU device, host_q is set to view \a host_ptr directly and
    *    dev_q to view host_q (no element-wise copy is done here);
    *  - otherwise nall() elements are copied with memcpy.
    * In all other cases the nall() elements are converted one by one.
    *
    * NOTE(review): the size test compares against double rather than
    * cpytyp, so the raw view/memcpy paths assume cpytyp is double -
    * confirm against callers.
    *
    * \param host_ptr Per-atom charges **/
  template <class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
    if (_q_avail == false) {
      double t = MPI_Wtime();
      if (dev->device_type() == UCL_CPU) {
        if (sizeof(numtyp) == sizeof(double)) {
          host_q.view((numtyp *)host_ptr, _nall, *dev);
          dev_q.view(host_q);
        } else
          for (int i = 0; i < _nall; i++)
            host_q[i] = host_ptr[i];
      } else {
        if (sizeof(numtyp) == sizeof(double))
          memcpy(host_q.begin(), host_ptr, _nall * sizeof(numtyp));
        else
          for (int i = 0; i < _nall; i++)
            host_q[i] = host_ptr[i];
      }
      _time_cast += MPI_Wtime() - t;
    }
  }

  /// Copy charges to device asynchronously
  /** Copies nall() elements; skipped if already marked available **/
  inline void add_q_data() {
    if (_q_avail == false) {
      ucl_copy(dev_q, host_q, _nall, true);
      _q_avail = true;
    }
  }

  /// Cast quaternions to write buffer
  /** Same strategy as cast_q_data(), applied to nall()*4 elements and the
    * host_quat / dev_quat vectors. Does nothing if the quaternion data has
    * already been marked available. The elapsed wall-clock time is added
    * to the cast time.
    *
    * NOTE(review): the raw view/memcpy paths assume cpytyp is double -
    * confirm against callers.
    *
    * \param host_ptr Quaternion data, read as a flat array of nall()*4
    *                 elements **/
  template <class cpytyp>
  inline void cast_quat_data(cpytyp *host_ptr) {
    if (_quat_avail == false) {
      double t = MPI_Wtime();
      if (dev->device_type() == UCL_CPU) {
        if (sizeof(numtyp) == sizeof(double)) {
          host_quat.view((numtyp *)host_ptr, _nall * 4, *dev);
          dev_quat.view(host_quat);
        } else
          for (int i = 0; i < _nall * 4; i++)
            host_quat[i] = host_ptr[i];
      } else {
        if (sizeof(numtyp) == sizeof(double))
          memcpy(host_quat.begin(), host_ptr, _nall * 4 * sizeof(numtyp));
        else
          for (int i = 0; i < _nall * 4; i++)
            host_quat[i] = host_ptr[i];
      }
      _time_cast += MPI_Wtime() - t;
    }
  }

  /// Copy quaternions to device
  /** Copies nall()*4 elements; skipped if already marked available **/
  inline void add_quat_data() {
    if (_quat_avail == false) {
      ucl_copy(dev_quat, host_quat, _nall * 4, true);
      _quat_avail = true;
    }
  }

  /// Return number of bytes used on device
  /** Returns the stored maximum and resets it to zero **/
  inline double max_gpu_bytes() {
    const double m = _max_gpu_bytes;
    _max_gpu_bytes = 0.0;
    return m;
  }

  // ------------------------------ DATA ----------------------------------

  /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
  UCL_D_Vec<numtyp> dev_x;
  /// Charges
  UCL_D_Vec<numtyp> dev_q;
  /// Quaternions
  UCL_D_Vec<numtyp> dev_quat;

  #ifdef GPU_CAST
  /// Raw double coordinates on the device (input of the cast kernel)
  UCL_D_Vec<double> dev_x_cast;
  /// Raw int types on the device (input of the cast kernel)
  UCL_D_Vec<int> dev_type_cast;
  /// Host staging buffer for the raw double coordinates
  UCL_H_Vec<double> host_x_cast;
  /// Host staging buffer for the raw int types
  UCL_H_Vec<int> host_type_cast;
  #endif

  /// Buffer for moving positions to device
  UCL_H_Vec<numtyp> host_x;
  /// Buffer for moving charge data to GPU
  UCL_H_Vec<numtyp> host_q;
  /// Buffer for moving quat data to GPU
  UCL_H_Vec<numtyp> host_quat;

  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<unsigned> dev_cell_id;
  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<int> dev_particle_id;
  /// Atom tag information for device nbor builds
  UCL_D_Vec<int> dev_tag;

  /// Device timers
  UCL_Timer time_pos, time_q, time_quat;

  /// Geryon device
  UCL_Device *dev;

 private:
  #ifdef GPU_CAST
  UCL_Program *atom_program;
  /// Kernel launched by add_x_data() to fill dev_x from the raw buffers
  UCL_Kernel k_cast_x;
  void compile_kernels(UCL_Device &dev);
  #endif

  bool _compiled;

  // True if data has been copied to device already
  bool _x_avail, _q_avail, _quat_avail, _resized;

  /// Allocate storage for \a nall atoms; returns false on failure
  /// (presumably - the result is AND-ed into resize()'s success flag)
  bool alloc(const int nall);

  bool _allocated, _rot, _charge, _other;
  int _max_atoms, _nall;
  bool _gpu_nbor, _bonds;
  /// Accumulated host-side cast/pack time in seconds (see cast_time())
  double _time_cast;

  /// Value reported (and reset) by max_gpu_bytes()
  double _max_gpu_bytes;

  #ifndef USE_OPENCL
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;
  #endif
};
#endif
Event Timeline
Log In to Comment