Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F87255360
nvd_device.h
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Oct 11, 14:04
Size
17 KB
Mime Type
text/x-c
Expires
Sun, Oct 13, 14:04 (1 d, 21 h)
Engine
blob
Format
Raw Data
Handle
21564268
Attached To
rLAMMPS lammps
nvd_device.h
View Options
/***************************************************************************
nvd_device.h
-------------------
W. Michael Brown
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_DEVICE
#define NVD_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include "nvd_macros.h"
#include "ucl_types.h"
namespace
ucl_cudadr
{
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef
CUstream
command_queue
;
inline
void
ucl_sync
(
CUstream
&
stream
)
{
CU_SAFE_CALL
(
cuStreamSynchronize
(
stream
));
}
struct
NVDProperties
{
int
device_id
;
std
::
string
name
;
int
major
;
int
minor
;
CUDA_INT_TYPE
totalGlobalMem
;
int
multiProcessorCount
;
CUdevprop_st
p
;
int
kernelExecTimeoutEnabled
;
int
integrated
;
int
canMapHostMemory
;
int
concurrentKernels
;
int
ECCEnabled
;
int
computeMode
;
};
/// Class for looking at device properties
/** \note Calls to change the device outside of the class results in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class
UCL_Device
{
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
inline
UCL_Device
();
inline
~
UCL_Device
();
/// Returns 1 (For compatibility with OpenCL)
inline
int
num_platforms
()
{
return
1
;
}
/// Return a string with name and info of the current platform
inline
std
::
string
platform_name
()
{
return
"NVIDIA Corporation NVIDIA CUDA Driver"
;
}
/// Delete any contexts/data and set the platform number to be used
inline
int
set_platform
(
const
int
pid
);
/// Return the number of devices that support CUDA
inline
int
num_devices
()
{
return
_properties
.
size
();
}
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline
int
set
(
int
num
);
/// Delete any context and associated data stored from a call to set()
inline
void
clear
();
/// Get the current device number
inline
int
device_num
()
{
return
_device
;
}
/// Returns the default stream for the current device
inline
command_queue
&
cq
()
{
return
cq
(
0
);
}
/// Returns the stream indexed by i
inline
command_queue
&
cq
(
const
int
i
)
{
return
_cq
[
i
];
}
/// Block until all commands in the default stream have completed
inline
void
sync
()
{
sync
(
0
);
}
/// Block until all commands in the specified stream have completed
inline
void
sync
(
const
int
i
)
{
ucl_sync
(
cq
(
i
));
}
/// Get the number of command queues currently available on device
inline
int
num_queues
()
{
return
_cq
.
size
();
}
/// Add a stream for device computations
inline
void
push_command_queue
()
{
_cq
.
push_back
(
CUstream
());
CU_SAFE_CALL
(
cuStreamCreate
(
&
_cq
.
back
(),
0
));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline
void
pop_command_queue
()
{
if
(
_cq
.
size
()
<
2
)
return
;
CU_SAFE_CALL_NS
(
cuStreamDestroy
(
_cq
.
back
()));
_cq
.
pop_back
();
}
/// Set the default command queue (by default this is the null stream)
/** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/
inline
void
set_command_queue
(
const
int
i
)
{
if
(
i
==
0
)
_cq
[
0
]
=
0
;
else
_cq
[
0
]
=
_cq
[
i
];
}
/// Get the current CUDA device name
inline
std
::
string
name
()
{
return
name
(
_device
);
}
/// Get the CUDA device name
inline
std
::
string
name
(
const
int
i
)
{
return
std
::
string
(
_properties
[
i
].
name
);
}
/// Get a string telling the type of the current device
inline
std
::
string
device_type_name
()
{
return
device_type_name
(
_device
);
}
/// Get a string telling the type of the device
inline
std
::
string
device_type_name
(
const
int
i
)
{
return
"GPU"
;
}
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline
int
device_type
()
{
return
device_type
(
_device
);
}
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline
int
device_type
(
const
int
i
)
{
return
UCL_GPU
;
}
/// Returns true if host memory is efficiently addressable from device
inline
bool
shared_memory
()
{
return
shared_memory
(
_device
);
}
/// Returns true if host memory is efficiently addressable from device
inline
bool
shared_memory
(
const
int
i
)
{
return
device_type
(
i
)
==
UCL_CPU
;
}
/// Returns true if double precision is support for the current device
inline
bool
double_precision
()
{
return
double_precision
(
_device
);
}
/// Returns true if double precision is support for the device
inline
bool
double_precision
(
const
int
i
)
{
return
arch
(
i
)
>=
1.3
;}
/// Get the number of compute units on the current device
inline
unsigned
cus
()
{
return
cus
(
_device
);
}
/// Get the number of compute units
inline
unsigned
cus
(
const
int
i
)
{
return
_properties
[
i
].
multiProcessorCount
;
}
/// Get the number of cores in the current device
inline
unsigned
cores
()
{
return
cores
(
_device
);
}
/// Get the number of cores
inline
unsigned
cores
(
const
int
i
)
{
if
(
arch
(
i
)
<
2.0
)
return
_properties
[
i
].
multiProcessorCount
*
8
;
else
if
(
arch
(
i
)
<
2.1
)
return
_properties
[
i
].
multiProcessorCount
*
32
;
else
if
(
arch
(
i
)
<
3.0
)
return
_properties
[
i
].
multiProcessorCount
*
48
;
else
return
_properties
[
i
].
multiProcessorCount
*
192
;
}
/// Get the gigabytes of global memory in the current device
inline
double
gigabytes
()
{
return
gigabytes
(
_device
);
}
/// Get the gigabytes of global memory
inline
double
gigabytes
(
const
int
i
)
{
return
static_cast
<
double
>
(
_properties
[
i
].
totalGlobalMem
)
/
1073741824
;
}
/// Get the bytes of global memory in the current device
inline
size_t
bytes
()
{
return
bytes
(
_device
);
}
/// Get the bytes of global memory
inline
size_t
bytes
(
const
int
i
)
{
return
_properties
[
i
].
totalGlobalMem
;
}
// Get the gigabytes of free memory in the current device
inline
double
free_gigabytes
()
{
return
free_gigabytes
(
_device
);
}
// Get the gigabytes of free memory
inline
double
free_gigabytes
(
const
int
i
)
{
return
static_cast
<
double
>
(
free_bytes
(
i
))
/
1073741824
;
}
// Get the bytes of free memory in the current device
inline
size_t
free_bytes
()
{
return
free_bytes
(
_device
);
}
// Get the bytes of free memory
inline
size_t
free_bytes
(
const
int
i
)
{
CUDA_INT_TYPE
dfree
,
dtotal
;
CU_SAFE_CALL_NS
(
cuMemGetInfo
(
&
dfree
,
&
dtotal
));
return
static_cast
<
size_t
>
(
dfree
);
}
/// Return the GPGPU compute capability for current device
inline
double
arch
()
{
return
arch
(
_device
);
}
/// Return the GPGPU compute capability
inline
double
arch
(
const
int
i
)
{
return
static_cast
<
double
>
(
_properties
[
i
].
minor
)
/
10
+
_properties
[
i
].
major
;}
/// Clock rate in GHz for current device
inline
double
clock_rate
()
{
return
clock_rate
(
_device
);
}
/// Clock rate in GHz
inline
double
clock_rate
(
const
int
i
)
{
return
_properties
[
i
].
p
.
clockRate
*
1e-6
;}
/// Get the maximum number of threads per block
inline
size_t
group_size
()
{
return
group_size
(
_device
);
}
/// Get the maximum number of threads per block
inline
size_t
group_size
(
const
int
i
)
{
return
_properties
[
i
].
p
.
maxThreadsPerBlock
;
}
/// Return the maximum memory pitch in bytes for current device
inline
size_t
max_pitch
()
{
return
max_pitch
(
_device
);
}
/// Return the maximum memory pitch in bytes
inline
size_t
max_pitch
(
const
int
i
)
{
return
_properties
[
i
].
p
.
memPitch
;
}
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline
bool
sharing_supported
()
{
return
sharing_supported
(
_device
);
}
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline
bool
sharing_supported
(
const
int
i
)
{
return
(
_properties
[
i
].
computeMode
==
CU_COMPUTEMODE_DEFAULT
);
}
/// True if splitting device into equal subdevices supported
inline
bool
fission_equal
()
{
return
fission_equal
(
_device
);
}
/// True if splitting device into equal subdevices supported
inline
bool
fission_equal
(
const
int
i
)
{
return
false
;
}
/// True if splitting device into subdevices by specified counts supported
inline
bool
fission_by_counts
()
{
return
fission_by_counts
(
_device
);
}
/// True if splitting device into subdevices by specified counts supported
inline
bool
fission_by_counts
(
const
int
i
)
{
return
false
;
}
/// True if splitting device into subdevices by affinity domains supported
inline
bool
fission_by_affinity
()
{
return
fission_by_affinity
(
_device
);
}
/// True if splitting device into subdevices by affinity domains supported
inline
bool
fission_by_affinity
(
const
int
i
)
{
return
false
;
}
/// Maximum number of subdevices allowed from device fission
inline
int
max_sub_devices
()
{
return
max_sub_devices
(
_device
);
}
/// Maximum number of subdevices allowed from device fission
inline
int
max_sub_devices
(
const
int
i
)
{
return
0
;
}
/// List all devices along with all properties
inline
void
print_all
(
std
::
ostream
&
out
);
private:
int
_device
,
_num_devices
;
std
::
vector
<
NVDProperties
>
_properties
;
std
::
vector
<
CUstream
>
_cq
;
CUdevice
_cu_device
;
CUcontext
_context
;
};
// Grabs the properties for all devices
UCL_Device
::
UCL_Device
()
{
CU_SAFE_CALL_NS
(
cuInit
(
0
));
CU_SAFE_CALL_NS
(
cuDeviceGetCount
(
&
_num_devices
));
for
(
int
dev
=
0
;
dev
<
_num_devices
;
++
dev
)
{
CUdevice
m
;
CU_SAFE_CALL_NS
(
cuDeviceGet
(
&
m
,
dev
));
int
major
,
minor
;
CU_SAFE_CALL_NS
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
m
));
if
(
major
==
9999
)
continue
;
_properties
.
push_back
(
NVDProperties
());
_properties
.
back
().
device_id
=
dev
;
_properties
.
back
().
major
=
major
;
_properties
.
back
().
minor
=
minor
;
char
namecstr
[
1024
];
CU_SAFE_CALL_NS
(
cuDeviceGetName
(
namecstr
,
1024
,
m
));
_properties
.
back
().
name
=
namecstr
;
CU_SAFE_CALL_NS
(
cuDeviceTotalMem
(
&
_properties
.
back
().
totalGlobalMem
,
m
));
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
multiProcessorCount
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
m
));
CU_SAFE_CALL_NS
(
cuDeviceGetProperties
(
&
_properties
.
back
().
p
,
m
));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
kernelExecTimeoutEnabled
,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT
,
dev
));
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
integrated
,
CU_DEVICE_ATTRIBUTE_INTEGRATED
,
dev
));
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
canMapHostMemory
,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
,
dev
));
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
computeMode
,
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
,
dev
));
#endif
#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
concurrentKernels
,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
,
dev
));
CU_SAFE_CALL_NS
(
cuDeviceGetAttribute
(
&
_properties
.
back
().
ECCEnabled
,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED
,
dev
));
#endif
}
_device
=-
1
;
_cq
.
push_back
(
CUstream
());
_cq
.
back
()
=
0
;
}
UCL_Device
::~
UCL_Device
()
{
clear
();
}
int
UCL_Device
::
set_platform
(
const
int
pid
)
{
clear
();
#ifdef UCL_DEBUG
assert
(
pid
<
num_platforms
());
#endif
return
UCL_SUCCESS
;
}
// Set the CUDA device to the specified device number
int
UCL_Device
::
set
(
int
num
)
{
clear
();
_device
=
_properties
[
num
].
device_id
;
CU_SAFE_CALL_NS
(
cuDeviceGet
(
&
_cu_device
,
_device
));
CUresult
err
=
cuCtxCreate
(
&
_context
,
0
,
_cu_device
);
if
(
err
!=
CUDA_SUCCESS
)
{
#ifndef UCL_NO_EXIT
std
::
cerr
<<
"UCL Error: Could not access accelerator number "
<<
num
<<
" for use.
\n
"
;
UCL_GERYON_EXIT
;
#endif
return
UCL_ERROR
;
}
return
UCL_SUCCESS
;
}
void
UCL_Device
::
clear
()
{
if
(
_device
>-
1
)
{
for
(
int
i
=
1
;
i
<
num_queues
();
i
++
)
pop_command_queue
();
cuCtxDestroy
(
_context
);
}
_device
=-
1
;
}
// List all devices along with all properties
void
UCL_Device
::
print_all
(
std
::
ostream
&
out
)
{
#if CUDA_VERSION >= 2020
int
driver_version
;
cuDriverGetVersion
(
&
driver_version
);
out
<<
"CUDA Driver Version: "
<<
driver_version
/
1000
<<
"."
<<
driver_version
%
100
<<
std
::
endl
;
#endif
if
(
num_devices
()
==
0
)
out
<<
"There is no device supporting CUDA
\n
"
;
for
(
int
i
=
0
;
i
<
num_devices
();
++
i
)
{
out
<<
"
\n
Device "
<<
i
<<
":
\"
"
<<
name
(
i
)
<<
"
\"\n
"
;
out
<<
" Type of device: "
<<
device_type_name
(
i
).
c_str
()
<<
std
::
endl
;
out
<<
" Compute capability: "
<<
arch
(
i
)
<<
std
::
endl
;
out
<<
" Double precision support: "
;
if
(
double_precision
(
i
))
out
<<
"Yes
\n
"
;
else
out
<<
"No
\n
"
;
out
<<
" Total amount of global memory: "
<<
gigabytes
(
i
)
<<
" GB
\n
"
;
#if CUDA_VERSION >= 2000
out
<<
" Number of compute units/multiprocessors: "
<<
_properties
[
i
].
multiProcessorCount
<<
std
::
endl
;
out
<<
" Number of cores: "
<<
cores
(
i
)
<<
std
::
endl
;
#endif
out
<<
" Total amount of constant memory: "
<<
_properties
[
i
].
p
.
totalConstantMemory
<<
" bytes
\n
"
;
out
<<
" Total amount of local/shared memory per block: "
<<
_properties
[
i
].
p
.
sharedMemPerBlock
<<
" bytes
\n
"
;
out
<<
" Total number of registers available per block: "
<<
_properties
[
i
].
p
.
regsPerBlock
<<
std
::
endl
;
out
<<
" Warp size: "
<<
_properties
[
i
].
p
.
SIMDWidth
<<
std
::
endl
;
out
<<
" Maximum number of threads per block: "
<<
_properties
[
i
].
p
.
maxThreadsPerBlock
<<
std
::
endl
;
out
<<
" Maximum group size (# of threads per block) "
<<
_properties
[
i
].
p
.
maxThreadsDim
[
0
]
<<
" x "
<<
_properties
[
i
].
p
.
maxThreadsDim
[
1
]
<<
" x "
<<
_properties
[
i
].
p
.
maxThreadsDim
[
2
]
<<
std
::
endl
;
out
<<
" Maximum item sizes (# threads for each dim) "
<<
_properties
[
i
].
p
.
maxGridSize
[
0
]
<<
" x "
<<
_properties
[
i
].
p
.
maxGridSize
[
1
]
<<
" x "
<<
_properties
[
i
].
p
.
maxGridSize
[
2
]
<<
std
::
endl
;
out
<<
" Maximum memory pitch: "
<<
max_pitch
(
i
)
<<
" bytes
\n
"
;
out
<<
" Texture alignment: "
<<
_properties
[
i
].
p
.
textureAlign
<<
" bytes
\n
"
;
out
<<
" Clock rate: "
<<
clock_rate
(
i
)
<<
" GHz
\n
"
;
#if CUDA_VERSION >= 2020
out
<<
" Run time limit on kernels: "
;
if
(
_properties
[
i
].
kernelExecTimeoutEnabled
)
out
<<
"Yes
\n
"
;
else
out
<<
"No
\n
"
;
out
<<
" Integrated: "
;
if
(
_properties
[
i
].
integrated
)
out
<<
"Yes
\n
"
;
else
out
<<
"No
\n
"
;
out
<<
" Support host page-locked memory mapping: "
;
if
(
_properties
[
i
].
canMapHostMemory
)
out
<<
"Yes
\n
"
;
else
out
<<
"No
\n
"
;
out
<<
" Compute mode: "
;
if
(
_properties
[
i
].
computeMode
==
CU_COMPUTEMODE_DEFAULT
)
out
<<
"Default
\n
"
;
// multiple threads can use device
#if CUDA_VERSION >= 8000
else
if
(
_properties
[
i
].
computeMode
==
CU_COMPUTEMODE_EXCLUSIVE_PROCESS
)
#else
else
if
(
_properties
[
i
].
computeMode
==
CU_COMPUTEMODE_EXCLUSIVE
)
#endif
out
<<
"Exclusive
\n
"
;
// only thread can use device
else
if
(
_properties
[
i
].
computeMode
==
CU_COMPUTEMODE_PROHIBITED
)
out
<<
"Prohibited
\n
"
;
// no thread can use device
#if CUDART_VERSION >= 4000
else
if
(
_properties
[
i
].
computeMode
==
CU_COMPUTEMODE_EXCLUSIVE_PROCESS
)
out
<<
"Exclusive Process
\n
"
;
// multiple threads 1 process
#endif
else
out
<<
"Unknown
\n
"
;
#endif
#if CUDA_VERSION >= 3010
out
<<
" Concurrent kernel execution: "
;
if
(
_properties
[
i
].
concurrentKernels
)
out
<<
"Yes
\n
"
;
else
out
<<
"No
\n
"
;
out
<<
" Device has ECC support enabled: "
;
if
(
_properties
[
i
].
ECCEnabled
)
out
<<
"Yes
\n
"
;
else
out
<<
"No
\n
"
;
#endif
}
}
}
#endif
Event Timeline
Log In to Comment