Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91847673
ucl_d_mat.h
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 15, 02:44
Size
17 KB
Mime Type
text/x-c++
Expires
Sun, Nov 17, 02:44 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
22335757
Attached To
rLAMMPS lammps
ucl_d_mat.h
View Options
/***************************************************************************
ucl_d_mat.h
-------------------
W. Michael Brown
Matrix Container on Device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// 2D Matrix on device (can have extra column storage to get correct alignment)
template
<
class
numtyp
>
class
UCL_D_Mat
:
public
UCL_BaseMat
{
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum
traits
{
DATA_TYPE
=
_UCL_DATA_ID
<
numtyp
>::
id
,
MEM_TYPE
=
0
,
PADDED
=
1
,
ROW_MAJOR
=
1
,
VECTOR
=
0
};
typedef
numtyp
data_type
;
UCL_D_Mat
()
:
_cols
(
0
)
{}
~
UCL_D_Mat
()
{
_device_free
(
*
this
);
}
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat
(
const
size_t
rows
,
const
size_t
cols
,
UCL_Device
&
device
,
const
enum
UCL_MEMOPT
kind
=
UCL_READ_WRITE
)
:
_cols
(
0
)
{
alloc
(
rows
,
cols
,
device
,
kind
);
}
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
template
<
class
mat_type
>
inline
int
alloc
(
const
size_t
rows
,
const
size_t
cols
,
mat_type
&
cq
,
const
enum
UCL_MEMOPT
kind
=
UCL_READ_WRITE
)
{
clear
();
int
err
=
_device_alloc
(
*
this
,
cq
,
rows
,
cols
,
_pitch
,
kind
);
if
(
err
!=
UCL_SUCCESS
)
{
#ifndef UCL_NO_EXIT
std
::
cerr
<<
"UCL Error: Could not allocate "
<<
rows
*
cols
*
sizeof
(
numtyp
)
<<
" bytes on device.
\n
"
;
UCL_GERYON_EXIT
;
#endif
return
err
;
}
_kind
=
kind
;
_rows
=
rows
;
_cols
=
cols
;
_row_size
=
_pitch
/
sizeof
(
numtyp
);
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_row_size
*
cols
;
#endif
#ifdef _OCL_MAT
_offset
=
0
;
#endif
return
err
;
}
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
inline
int
alloc
(
const
size_t
rows
,
const
size_t
cols
,
UCL_Device
&
device
,
const
enum
UCL_MEMOPT
kind
=
UCL_READ_WRITE
)
{
clear
();
int
err
=
_device_alloc
(
*
this
,
device
,
rows
,
cols
,
_pitch
,
kind
);
if
(
err
!=
UCL_SUCCESS
)
{
#ifndef UCL_NO_EXIT
std
::
cerr
<<
"UCL Error: Could not allocate "
<<
rows
*
cols
*
sizeof
(
numtyp
)
<<
" bytes on device.
\n
"
;
UCL_GERYON_EXIT
;
#endif
return
err
;
}
_kind
=
kind
;
_rows
=
rows
;
_cols
=
cols
;
_row_size
=
_pitch
/
sizeof
(
numtyp
);
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_row_size
*
cols
;
#endif
#ifdef _OCL_MAT
_offset
=
0
;
#endif
return
err
;
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template
<
class
ucl_type
>
inline
void
view
(
ucl_type
&
input
,
const
size_t
rows
,
const
size_t
cols
,
const
size_t
stride
)
{
clear
();
_kind
=
UCL_VIEW
;
_rows
=
rows
;
_cols
=
cols
;
_pitch
=
stride
*
sizeof
(
numtyp
);
_row_size
=
stride
;
this
->
_cq
=
input
.
cq
();
#ifdef _OCL_MAT
_offset
=
input
.
offset
();
_array
=
input
.
cbegin
();
CL_SAFE_CALL
(
clRetainMemObject
(
input
.
cbegin
()));
CL_SAFE_CALL
(
clRetainCommandQueue
(
input
.
cq
()));
#else
_device_view
(
&
_array
,
input
.
begin
());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_cols
;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
<
class
ucl_type
>
inline
void
view
(
ucl_type
&
input
,
const
size_t
rows
,
const
size_t
cols
)
{
view
(
input
,
rows
,
cols
,
input
.
row_size
());
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
<
class
ucl_type
>
inline
void
view
(
ucl_type
&
input
,
const
size_t
cols
)
{
view
(
input
,
1
,
cols
);
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
<
class
ucl_type
>
inline
void
view
(
ucl_type
&
input
)
{
view
(
input
,
input
.
rows
(),
input
.
cols
());
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template
<
class
ptr_type
>
inline
void
view
(
ptr_type
input
,
const
size_t
rows
,
const
size_t
cols
,
const
size_t
stride
,
UCL_Device
&
dev
)
{
clear
();
_kind
=
UCL_VIEW
;
_cols
=
cols
;
_rows
=
rows
;
_pitch
=
stride
*
sizeof
(
numtyp
);
_row_size
=
stride
;
this
->
_cq
=
dev
.
cq
();
_array
=
input
;
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_cols
;
#endif
#ifdef _OCL_MAT
_offset
=
0
;
CL_SAFE_CALL
(
clRetainMemObject
(
input
));
CL_SAFE_CALL
(
clRetainCommandQueue
(
dev
.
cq
()));
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
<
class
ptr_type
>
inline
void
view
(
ptr_type
input
,
const
size_t
rows
,
const
size_t
cols
,
UCL_Device
&
dev
)
{
view
(
input
,
rows
,
cols
,
cols
,
dev
);
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
<
class
ptr_type
>
inline
void
view
(
ptr_type
input
,
const
size_t
cols
,
UCL_Device
&
dev
)
{
view
(
input
,
1
,
cols
,
dev
);
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template
<
class
ucl_type
>
inline
void
view_offset
(
const
size_t
offset
,
ucl_type
&
input
,
const
size_t
rows
,
const
size_t
cols
,
const
size_t
stride
)
{
clear
();
_kind
=
UCL_VIEW
;
_cols
=
cols
;
_rows
=
rows
;
_pitch
=
stride
*
sizeof
(
numtyp
);
_row_size
=
stride
;
this
->
_cq
=
input
.
cq
();
#ifdef _OCL_MAT
_array
=
input
.
begin
();
_offset
=
offset
+
input
.
offset
();
CL_SAFE_CALL
(
clRetainMemObject
(
input
.
cbegin
()));
CL_SAFE_CALL
(
clRetainCommandQueue
(
input
.
cq
()));
#else
_device_view
(
&
_array
,
input
.
begin
(),
offset
,
sizeof
(
numtyp
));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_cols
;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
<
class
ucl_type
>
inline
void
view_offset
(
const
size_t
offset
,
ucl_type
&
input
,
const
size_t
rows
,
const
size_t
cols
)
{
view_offset
(
offset
,
input
,
rows
,
cols
,
input
.
row_size
());
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
<
class
ucl_type
>
inline
void
view_offset
(
const
size_t
offset
,
ucl_type
&
input
,
const
size_t
cols
)
{
view_offset
(
offset
,
input
,
1
,
cols
);
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
<
class
ucl_type
>
inline
void
view_offset
(
const
size_t
offset
,
ucl_type
&
input
)
{
if
(
input
.
rows
()
==
1
)
view_offset
(
offset
,
input
,
1
,
input
.
cols
()
-
offset
);
else
view_offset
(
offset
,
input
,
input
.
rows
()
-
offset
/
input
.
row_size
(),
input
.
cols
());
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template
<
class
ptr_type
>
inline
void
view_offset
(
const
size_t
offset
,
ptr_type
input
,
const
size_t
rows
,
const
size_t
cols
,
const
size_t
stride
,
UCL_Device
&
dev
)
{
clear
();
_kind
=
UCL_VIEW
;
_cols
=
cols
;
_rows
=
rows
;
_pitch
=
stride
*
sizeof
(
numtyp
);
_row_size
=
stride
;
this
->
_cq
=
dev
.
cq
();
#ifdef _OCL_MAT
_array
=
input
;
_offset
=
offset
;
CL_SAFE_CALL
(
clRetainMemObject
(
input
));
CL_SAFE_CALL
(
clRetainCommandQueue
(
dev
.
cq
()));
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array
=
input
+
offset
*
sizeof
(
numtyp
);
#else
_array
=
input
+
offset
;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_cols
;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
<
class
ptr_type
>
inline
void
view_offset
(
const
size_t
offset
,
ptr_type
input
,
const
size_t
rows
,
const
size_t
cols
,
UCL_Device
&
dev
)
{
view_offset
(
offset
,
input
,
rows
,
cols
,
cols
,
dev
);
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
<
class
ptr_type
>
inline
void
view_offset
(
const
size_t
offset
,
ptr_type
input
,
const
size_t
cols
,
UCL_Device
&
dev
)
{
view_offset
(
offset
,
input
,
1
,
cols
,
dev
);
}
/// Free memory and set size to 0
inline
void
clear
()
{
_device_free
(
*
this
);
_cols
=
0
;
_kind
=
UCL_VIEW
;
}
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
inline
int
resize
(
const
int
rows
,
const
int
cols
)
{
assert
(
_kind
!=
UCL_VIEW
);
int
err
=
_device_resize
(
*
this
,
rows
,
cols
,
_pitch
);
if
(
err
!=
UCL_SUCCESS
)
{
#ifndef UCL_NO_EXIT
std
::
cerr
<<
"UCL Error: Could not allocate "
<<
rows
*
cols
*
sizeof
(
numtyp
)
<<
" bytes on device.
\n
"
;
UCL_GERYON_EXIT
;
#endif
return
err
;
}
_rows
=
rows
;
_cols
=
cols
;
_row_size
=
_pitch
/
sizeof
(
numtyp
);
#ifndef _UCL_DEVICE_PTR_MAT
_end
=
_array
+
_row_size
*
cols
;
#endif
#ifdef _OCL_MAT
_offset
=
0
;
#endif
return
err
;
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline
int
resize_ib
(
const
int
rows
,
const
int
cols
)
{
if
(
cols
>
_cols
||
rows
>
_rows
)
return
resize
(
rows
,
cols
);
else
return
UCL_SUCCESS
;
}
/// Set each element to zero asynchronously in the default command_queue
inline
void
zero
()
{
zero
(
_cq
);
}
/// Set first n elements to zero asynchronously in the default command_queue
inline
void
zero
(
const
int
n
)
{
zero
(
n
,
_cq
);
}
/// Set each element to zero asynchronously
inline
void
zero
(
command_queue
&
cq
)
{
_device_zero
(
*
this
,
row_bytes
()
*
_rows
,
cq
);
}
/// Set first n elements to zero asynchronously
inline
void
zero
(
const
int
n
,
command_queue
&
cq
)
{
_device_zero
(
*
this
,
n
*
sizeof
(
numtyp
),
cq
);
}
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline
device_ptr
&
begin
()
{
return
_array
;
}
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline
const
device_ptr
&
begin
()
const
{
return
_array
;
}
#else
/// For CUDA-RT, get device pointer to first element
inline
numtyp
*
&
begin
()
{
return
_array
;
}
/// For CUDA-RT, get device pointer to first element
inline
numtyp
*
const
&
begin
()
const
{
return
_array
;
}
/// For CUDA-RT, get device pointer to one past last element
inline
numtyp
*
end
()
{
return
_end
;
}
/// For CUDA-RT, get device pointer to one past last element
inline
const
numtyp
*
end
()
const
{
return
_end
;
}
#endif
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline
device_ptr
&
cbegin
()
{
return
_array
;
}
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline
const
device_ptr
&
cbegin
()
const
{
return
_array
;
}
#else
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline
numtyp
**
cbegin
()
{
return
&
_array
;
}
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline
const
numtyp
**
cbegin
()
const
{
return
&
_array
;
}
#endif
/// Get the number of elements
inline
size_t
numel
()
const
{
return
_cols
*
_rows
;
}
/// Get the number of rows
inline
size_t
rows
()
const
{
return
_rows
;
}
/// Get the number of columns
inline
size_t
cols
()
const
{
return
_cols
;
}
///Get the size of a row (including any padding) in elements
inline
size_t
row_size
()
const
{
return
_row_size
;
}
/// Get the size of a row (including any padding) in bytes
inline
size_t
row_bytes
()
const
{
return
_pitch
;
}
/// Get the size in bytes of 1 element
inline
int
element_size
()
const
{
return
sizeof
(
numtyp
);
}
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline
size_t
offset
()
const
{
return
_offset
;
}
#else
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline
size_t
offset
()
const
{
return
0
;
}
#endif
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline
size_t
byteoff
()
const
{
return
offset
()
*
sizeof
(
numtyp
);
}
private:
size_t
_pitch
,
_row_size
,
_rows
,
_cols
;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr
_array
;
#else
numtyp
*
_array
,
*
_end
;
#endif
#ifdef _OCL_MAT
size_t
_offset
;
#endif
};
#endif
Event Timeline
Log In to Comment