Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F65766584
Kokkos_Cuda_Impl.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Jun 6, 02:13
Size
26 KB
Mime Type
text/x-c++
Expires
Sat, Jun 8, 02:13 (2 d)
Engine
blob
Format
Raw Data
Handle
18081931
Attached To
rLAMMPS lammps
Kokkos_Cuda_Impl.cpp
View Options
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <cstdlib>
/* Standard 'C++' libraries */
#include <vector>
#include <iostream>
#include <sstream>
#include <string>
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__device__
__constant__
unsigned
long
kokkos_impl_cuda_constant_memory_buffer
[
Kokkos
::
Impl
::
CudaTraits
::
ConstantMemoryUsage
/
sizeof
(
unsigned
long
)
]
;
#endif
/*--------------------------------------------------------------------------*/
namespace
Kokkos
{
namespace
Impl
{
namespace
{
__global__
void
query_cuda_kernel_arch
(
int
*
d_arch
)
{
#if defined( __CUDA_ARCH__ )
*
d_arch
=
__CUDA_ARCH__
;
#else
*
d_arch
=
0
;
#endif
}
/** Query what compute capability is actually launched to the device: */
int
cuda_kernel_arch
()
{
int
*
d_arch
=
0
;
cudaMalloc
(
(
void
**
)
&
d_arch
,
sizeof
(
int
)
);
query_cuda_kernel_arch
<<<
1
,
1
>>>
(
d_arch
);
int
arch
=
0
;
cudaMemcpy
(
&
arch
,
d_arch
,
sizeof
(
int
)
,
cudaMemcpyDefault
);
cudaFree
(
d_arch
);
return
arch
;
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
bool
cuda_launch_blocking
()
{
const
char
*
env
=
getenv
(
"CUDA_LAUNCH_BLOCKING"
);
if
(
env
==
0
)
return
false
;
return
atoi
(
env
);
}
#endif
}
void
cuda_device_synchronize
()
{
CUDA_SAFE_CALL
(
cudaDeviceSynchronize
()
);
}
void
cuda_internal_error_throw
(
cudaError
e
,
const
char
*
name
,
const
char
*
file
,
const
int
line
)
{
std
::
ostringstream
out
;
out
<<
name
<<
" error( "
<<
cudaGetErrorName
(
e
)
<<
"): "
<<
cudaGetErrorString
(
e
);
if
(
file
)
{
out
<<
" "
<<
file
<<
":"
<<
line
;
}
throw_runtime_exception
(
out
.
str
()
);
}
//----------------------------------------------------------------------------
// Some significant cuda device properties:
//
// cudaDeviceProp::name : Text label for device
// cudaDeviceProp::major : Device major number
// cudaDeviceProp::minor : Device minor number
// cudaDeviceProp::warpSize : number of threads per warp
// cudaDeviceProp::multiProcessorCount : number of multiprocessors
// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block
// cudaDeviceProp::totalConstMem : capacity of constant memory
// cudaDeviceProp::totalGlobalMem : capacity of global memory
// cudaDeviceProp::maxGridSize[3] : maximum grid size
//
// Section 4.4.2.4 of the CUDA Toolkit Reference Manual
//
// struct cudaDeviceProp {
// char name[256];
// size_t totalGlobalMem;
// size_t sharedMemPerBlock;
// int regsPerBlock;
// int warpSize;
// size_t memPitch;
// int maxThreadsPerBlock;
// int maxThreadsDim[3];
// int maxGridSize[3];
// size_t totalConstMem;
// int major;
// int minor;
// int clockRate;
// size_t textureAlignment;
// int deviceOverlap;
// int multiProcessorCount;
// int kernelExecTimeoutEnabled;
// int integrated;
// int canMapHostMemory;
// int computeMode;
// int concurrentKernels;
// int ECCEnabled;
// int pciBusID;
// int pciDeviceID;
// int tccDriver;
// int asyncEngineCount;
// int unifiedAddressing;
// int memoryClockRate;
// int memoryBusWidth;
// int l2CacheSize;
// int maxThreadsPerMultiProcessor;
// };
namespace
{
class
CudaInternalDevices
{
public
:
enum
{
MAXIMUM_DEVICE_COUNT
=
64
};
struct
cudaDeviceProp
m_cudaProp
[
MAXIMUM_DEVICE_COUNT
]
;
int
m_cudaDevCount
;
CudaInternalDevices
();
static
const
CudaInternalDevices
&
singleton
();
};
CudaInternalDevices
::
CudaInternalDevices
()
{
// See 'cudaSetDeviceFlags' for host-device thread interaction
// Section 4.4.2.6 of the CUDA Toolkit Reference Manual
CUDA_SAFE_CALL
(
cudaGetDeviceCount
(
&
m_cudaDevCount
)
);
if
(
m_cudaDevCount
>
MAXIMUM_DEVICE_COUNT
)
{
Kokkos
::
abort
(
"Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos."
);
}
for
(
int
i
=
0
;
i
<
m_cudaDevCount
;
++
i
)
{
CUDA_SAFE_CALL
(
cudaGetDeviceProperties
(
m_cudaProp
+
i
,
i
)
);
}
}
const
CudaInternalDevices
&
CudaInternalDevices
::
singleton
()
{
static
CudaInternalDevices
self
;
return
self
;
}
}
//----------------------------------------------------------------------------
class
CudaInternal
{
private
:
CudaInternal
(
const
CudaInternal
&
);
CudaInternal
&
operator
=
(
const
CudaInternal
&
);
public
:
typedef
Cuda
::
size_type
size_type
;
int
m_cudaDev
;
int
m_cudaArch
;
unsigned
m_multiProcCount
;
unsigned
m_maxWarpCount
;
unsigned
m_maxBlock
;
unsigned
m_maxSharedWords
;
uint32_t
m_maxConcurrency
;
size_type
m_scratchSpaceCount
;
size_type
m_scratchFlagsCount
;
size_type
m_scratchUnifiedCount
;
size_type
m_scratchUnifiedSupported
;
size_type
m_streamCount
;
size_type
*
m_scratchSpace
;
size_type
*
m_scratchFlags
;
size_type
*
m_scratchUnified
;
uint32_t
*
m_scratchConcurrentBitset
;
cudaStream_t
*
m_stream
;
static
int
was_initialized
;
static
int
was_finalized
;
static
CudaInternal
&
singleton
();
int
verify_is_initialized
(
const
char
*
const
label
)
const
;
int
is_initialized
()
const
{
return
0
!=
m_scratchSpace
&&
0
!=
m_scratchFlags
;
}
void
initialize
(
int
cuda_device_id
,
int
stream_count
);
void
finalize
();
void
print_configuration
(
std
::
ostream
&
)
const
;
~
CudaInternal
();
CudaInternal
()
:
m_cudaDev
(
-
1
)
,
m_cudaArch
(
-
1
)
,
m_multiProcCount
(
0
)
,
m_maxWarpCount
(
0
)
,
m_maxBlock
(
0
)
,
m_maxSharedWords
(
0
)
,
m_maxConcurrency
(
0
)
,
m_scratchSpaceCount
(
0
)
,
m_scratchFlagsCount
(
0
)
,
m_scratchUnifiedCount
(
0
)
,
m_scratchUnifiedSupported
(
0
)
,
m_streamCount
(
0
)
,
m_scratchSpace
(
0
)
,
m_scratchFlags
(
0
)
,
m_scratchUnified
(
0
)
,
m_scratchConcurrentBitset
(
0
)
,
m_stream
(
0
)
{}
size_type
*
scratch_space
(
const
size_type
size
);
size_type
*
scratch_flags
(
const
size_type
size
);
size_type
*
scratch_unified
(
const
size_type
size
);
};
int
CudaInternal
::
was_initialized
=
0
;
int
CudaInternal
::
was_finalized
=
0
;
//----------------------------------------------------------------------------
void
CudaInternal
::
print_configuration
(
std
::
ostream
&
s
)
const
{
const
CudaInternalDevices
&
dev_info
=
CudaInternalDevices
::
singleton
();
#if defined( KOKKOS_ENABLE_CUDA )
s
<<
"macro KOKKOS_ENABLE_CUDA : defined"
<<
std
::
endl
;
#endif
#if defined( CUDA_VERSION )
s
<<
"macro CUDA_VERSION = "
<<
CUDA_VERSION
<<
" = version "
<<
CUDA_VERSION
/
1000
<<
"."
<<
(
CUDA_VERSION
%
1000
)
/
10
<<
std
::
endl
;
#endif
for
(
int
i
=
0
;
i
<
dev_info
.
m_cudaDevCount
;
++
i
)
{
s
<<
"Kokkos::Cuda[ "
<<
i
<<
" ] "
<<
dev_info
.
m_cudaProp
[
i
].
name
<<
" capability "
<<
dev_info
.
m_cudaProp
[
i
].
major
<<
"."
<<
dev_info
.
m_cudaProp
[
i
].
minor
<<
", Total Global Memory: "
<<
human_memory_size
(
dev_info
.
m_cudaProp
[
i
].
totalGlobalMem
)
<<
", Shared Memory per Block: "
<<
human_memory_size
(
dev_info
.
m_cudaProp
[
i
].
sharedMemPerBlock
);
if
(
m_cudaDev
==
i
)
s
<<
" : Selected"
;
s
<<
std
::
endl
;
}
}
//----------------------------------------------------------------------------
CudaInternal
::~
CudaInternal
()
{
if
(
m_stream
||
m_scratchSpace
||
m_scratchFlags
||
m_scratchUnified
||
m_scratchConcurrentBitset
)
{
std
::
cerr
<<
"Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<<
std
::
endl
;
std
::
cerr
.
flush
();
}
m_cudaDev
=
-
1
;
m_cudaArch
=
-
1
;
m_multiProcCount
=
0
;
m_maxWarpCount
=
0
;
m_maxBlock
=
0
;
m_maxSharedWords
=
0
;
m_maxConcurrency
=
0
;
m_scratchSpaceCount
=
0
;
m_scratchFlagsCount
=
0
;
m_scratchUnifiedCount
=
0
;
m_scratchUnifiedSupported
=
0
;
m_streamCount
=
0
;
m_scratchSpace
=
0
;
m_scratchFlags
=
0
;
m_scratchUnified
=
0
;
m_scratchConcurrentBitset
=
0
;
m_stream
=
0
;
}
int
CudaInternal
::
verify_is_initialized
(
const
char
*
const
label
)
const
{
if
(
m_cudaDev
<
0
)
{
std
::
cerr
<<
"Kokkos::Cuda::"
<<
label
<<
" : ERROR device not initialized"
<<
std
::
endl
;
}
return
0
<=
m_cudaDev
;
}
CudaInternal
&
CudaInternal
::
singleton
()
{
static
CudaInternal
self
;
return
self
;
}
void
CudaInternal
::
initialize
(
int
cuda_device_id
,
int
stream_count
)
{
if
(
was_finalized
)
Kokkos
::
abort
(
"Calling Cuda::initialize after Cuda::finalize is illegal
\n
"
);
was_initialized
=
1
;
if
(
is_initialized
()
)
return
;
enum
{
WordSize
=
sizeof
(
size_type
)
};
if
(
!
HostSpace
::
execution_space
::
is_initialized
()
)
{
const
std
::
string
msg
(
"Cuda::initialize ERROR : HostSpace::execution_space is not initialized"
);
throw_runtime_exception
(
msg
);
}
const
CudaInternalDevices
&
dev_info
=
CudaInternalDevices
::
singleton
();
const
bool
ok_init
=
0
==
m_scratchSpace
||
0
==
m_scratchFlags
;
const
bool
ok_id
=
0
<=
cuda_device_id
&&
cuda_device_id
<
dev_info
.
m_cudaDevCount
;
// Need device capability 3.0 or better
const
bool
ok_dev
=
ok_id
&&
(
3
<=
dev_info
.
m_cudaProp
[
cuda_device_id
].
major
&&
0
<=
dev_info
.
m_cudaProp
[
cuda_device_id
].
minor
);
if
(
ok_init
&&
ok_dev
)
{
const
struct
cudaDeviceProp
&
cudaProp
=
dev_info
.
m_cudaProp
[
cuda_device_id
];
m_cudaDev
=
cuda_device_id
;
CUDA_SAFE_CALL
(
cudaSetDevice
(
m_cudaDev
)
);
CUDA_SAFE_CALL
(
cudaDeviceReset
()
);
Kokkos
::
Impl
::
cuda_device_synchronize
();
// Query what compute capability architecture a kernel executes:
m_cudaArch
=
cuda_kernel_arch
();
int
compiled_major
=
m_cudaArch
/
100
;
int
compiled_minor
=
(
m_cudaArch
%
100
)
/
10
;
if
(
compiled_major
<
5
&&
cudaProp
.
major
>=
5
)
{
std
::
stringstream
ss
;
ss
<<
"Kokkos::Cuda::initialize ERROR: running kernels compiled for compute capability "
<<
compiled_major
<<
"."
<<
compiled_minor
<<
" (< 5.0) on device with compute capability "
<<
cudaProp
.
major
<<
"."
<<
cudaProp
.
minor
<<
" (>=5.0), this would give incorrect results!"
<<
std
::
endl
;
std
::
string
msg
=
ss
.
str
();
Kokkos
::
abort
(
msg
.
c_str
()
);
}
if
(
compiled_major
!=
cudaProp
.
major
||
compiled_minor
!=
cudaProp
.
minor
)
{
std
::
cerr
<<
"Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<<
compiled_major
<<
"."
<<
compiled_minor
<<
" on device with compute capability "
<<
cudaProp
.
major
<<
"."
<<
cudaProp
.
minor
<<
" , this will likely reduce potential performance."
<<
std
::
endl
;
}
// number of multiprocessors
m_multiProcCount
=
cudaProp
.
multiProcessorCount
;
//----------------------------------
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
// HCE 2012-February :
// Found bug in CUDA 4.1 that sometimes a kernel launch would fail
// if the thread count == 1024 and a functor is passed to the kernel.
// Copying the kernel to constant memory and then launching with
// thread count == 1024 would work fine.
//
// HCE 2012-October :
// All compute capabilities support at least 16 warps (512 threads).
// However, we have found that 8 warps typically gives better performance.
m_maxWarpCount
=
8
;
// m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
if
(
Impl
::
CudaTraits
::
WarpSize
<
m_maxWarpCount
)
{
m_maxWarpCount
=
Impl
::
CudaTraits
::
WarpSize
;
}
m_maxSharedWords
=
cudaProp
.
sharedMemPerBlock
/
WordSize
;
//----------------------------------
// Maximum number of blocks:
m_maxBlock
=
cudaProp
.
maxGridSize
[
0
]
;
//----------------------------------
m_scratchUnifiedSupported
=
cudaProp
.
unifiedAddressing
;
if
(
!
m_scratchUnifiedSupported
)
{
std
::
cout
<<
"Kokkos::Cuda device "
<<
cudaProp
.
name
<<
" capability "
<<
cudaProp
.
major
<<
"."
<<
cudaProp
.
minor
<<
" does not support unified virtual address space"
<<
std
::
endl
;
}
//----------------------------------
// Multiblock reduction uses scratch flags for counters
// and scratch space for partial reduction values.
// Allocate some initial space. This will grow as needed.
{
const
unsigned
reduce_block_count
=
m_maxWarpCount
*
Impl
::
CudaTraits
::
WarpSize
;
(
void
)
scratch_unified
(
16
*
sizeof
(
size_type
)
);
(
void
)
scratch_flags
(
reduce_block_count
*
2
*
sizeof
(
size_type
)
);
(
void
)
scratch_space
(
reduce_block_count
*
16
*
sizeof
(
size_type
)
);
}
//----------------------------------
// Concurrent bitset for obtaining unique tokens from within
// an executing kernel.
{
const
unsigned
max_threads_per_sm
=
2048
;
// up to capability 7.0
m_maxConcurrency
=
max_threads_per_sm
*
cudaProp
.
multiProcessorCount
;
const
int32_t
buffer_bound
=
Kokkos
::
Impl
::
concurrent_bitset
::
buffer_bound
(
m_maxConcurrency
);
// Allocate and initialize uint32_t[ buffer_bound ]
typedef
Kokkos
::
Experimental
::
Impl
::
SharedAllocationRecord
<
Kokkos
::
CudaSpace
,
void
>
Record
;
Record
*
const
r
=
Record
::
allocate
(
Kokkos
::
CudaSpace
()
,
"InternalScratchBitset"
,
sizeof
(
uint32_t
)
*
buffer_bound
);
Record
::
increment
(
r
);
m_scratchConcurrentBitset
=
reinterpret_cast
<
uint32_t
*>
(
r
->
data
()
);
CUDA_SAFE_CALL
(
cudaMemset
(
m_scratchConcurrentBitset
,
0
,
sizeof
(
uint32_t
)
*
buffer_bound
)
);
}
//----------------------------------
if
(
stream_count
)
{
m_stream
=
(
cudaStream_t
*
)
::
malloc
(
stream_count
*
sizeof
(
cudaStream_t
)
);
m_streamCount
=
stream_count
;
for
(
size_type
i
=
0
;
i
<
m_streamCount
;
++
i
)
m_stream
[
i
]
=
0
;
}
}
else
{
std
::
ostringstream
msg
;
msg
<<
"Kokkos::Cuda::initialize("
<<
cuda_device_id
<<
") FAILED"
;
if
(
!
ok_init
)
{
msg
<<
" : Already initialized"
;
}
if
(
!
ok_id
)
{
msg
<<
" : Device identifier out of range "
<<
"[0.."
<<
dev_info
.
m_cudaDevCount
<<
"]"
;
}
else
if
(
!
ok_dev
)
{
msg
<<
" : Device "
;
msg
<<
dev_info
.
m_cudaProp
[
cuda_device_id
].
major
;
msg
<<
"."
;
msg
<<
dev_info
.
m_cudaProp
[
cuda_device_id
].
minor
;
msg
<<
" has insufficient capability, required 3.0 or better"
;
}
Kokkos
::
Impl
::
throw_runtime_exception
(
msg
.
str
()
);
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
if
(
!
cuda_launch_blocking
())
{
std
::
cout
<<
"Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default"
<<
std
::
endl
;
std
::
cout
<<
" without setting CUDA_LAUNCH_BLOCKING=1."
<<
std
::
endl
;
std
::
cout
<<
" The code must call Cuda::fence() after each kernel"
<<
std
::
endl
;
std
::
cout
<<
" or will likely crash when accessing data on the host."
<<
std
::
endl
;
}
const
char
*
env_force_device_alloc
=
getenv
(
"CUDA_MANAGED_FORCE_DEVICE_ALLOC"
);
bool
force_device_alloc
;
if
(
env_force_device_alloc
==
0
)
force_device_alloc
=
false
;
else
force_device_alloc
=
atoi
(
env_force_device_alloc
)
!=
0
;
const
char
*
env_visible_devices
=
getenv
(
"CUDA_VISIBLE_DEVICES"
);
bool
visible_devices_one
=
true
;
if
(
env_visible_devices
==
0
)
visible_devices_one
=
false
;
if
(
!
visible_devices_one
&&
!
force_device_alloc
)
{
std
::
cout
<<
"Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default"
<<
std
::
endl
;
std
::
cout
<<
" without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or "
<<
std
::
endl
;
std
::
cout
<<
" setting CUDA_VISIBLE_DEVICES."
<<
std
::
endl
;
std
::
cout
<<
" This could on multi GPU systems lead to severe performance"
<<
std
::
endl
;
std
::
cout
<<
" penalties."
<<
std
::
endl
;
}
#endif
cudaThreadSetCacheConfig
(
cudaFuncCachePreferShared
);
// Init the array for used for arbitrarily sized atomics
Impl
::
initialize_host_cuda_lock_arrays
();
}
//----------------------------------------------------------------------------
typedef
Cuda
::
size_type
ScratchGrain
[
Impl
::
CudaTraits
::
WarpSize
]
;
enum
{
sizeScratchGrain
=
sizeof
(
ScratchGrain
)
};
Cuda
::
size_type
*
CudaInternal
::
scratch_flags
(
const
Cuda
::
size_type
size
)
{
if
(
verify_is_initialized
(
"scratch_flags"
)
&&
m_scratchFlagsCount
*
sizeScratchGrain
<
size
)
{
m_scratchFlagsCount
=
(
size
+
sizeScratchGrain
-
1
)
/
sizeScratchGrain
;
typedef
Kokkos
::
Experimental
::
Impl
::
SharedAllocationRecord
<
Kokkos
::
CudaSpace
,
void
>
Record
;
Record
*
const
r
=
Record
::
allocate
(
Kokkos
::
CudaSpace
()
,
"InternalScratchFlags"
,
(
sizeof
(
ScratchGrain
)
*
m_scratchFlagsCount
)
);
Record
::
increment
(
r
);
m_scratchFlags
=
reinterpret_cast
<
size_type
*>
(
r
->
data
()
);
CUDA_SAFE_CALL
(
cudaMemset
(
m_scratchFlags
,
0
,
m_scratchFlagsCount
*
sizeScratchGrain
)
);
}
return
m_scratchFlags
;
}
Cuda
::
size_type
*
CudaInternal
::
scratch_space
(
const
Cuda
::
size_type
size
)
{
if
(
verify_is_initialized
(
"scratch_space"
)
&&
m_scratchSpaceCount
*
sizeScratchGrain
<
size
)
{
m_scratchSpaceCount
=
(
size
+
sizeScratchGrain
-
1
)
/
sizeScratchGrain
;
typedef
Kokkos
::
Experimental
::
Impl
::
SharedAllocationRecord
<
Kokkos
::
CudaSpace
,
void
>
Record
;
Record
*
const
r
=
Record
::
allocate
(
Kokkos
::
CudaSpace
()
,
"InternalScratchSpace"
,
(
sizeof
(
ScratchGrain
)
*
m_scratchSpaceCount
)
);
Record
::
increment
(
r
);
m_scratchSpace
=
reinterpret_cast
<
size_type
*>
(
r
->
data
()
);
}
return
m_scratchSpace
;
}
Cuda
::
size_type
*
CudaInternal
::
scratch_unified
(
const
Cuda
::
size_type
size
)
{
if
(
verify_is_initialized
(
"scratch_unified"
)
&&
m_scratchUnifiedSupported
&&
m_scratchUnifiedCount
*
sizeScratchGrain
<
size
)
{
m_scratchUnifiedCount
=
(
size
+
sizeScratchGrain
-
1
)
/
sizeScratchGrain
;
typedef
Kokkos
::
Experimental
::
Impl
::
SharedAllocationRecord
<
Kokkos
::
CudaHostPinnedSpace
,
void
>
Record
;
Record
*
const
r
=
Record
::
allocate
(
Kokkos
::
CudaHostPinnedSpace
()
,
"InternalScratchUnified"
,
(
sizeof
(
ScratchGrain
)
*
m_scratchUnifiedCount
)
);
Record
::
increment
(
r
);
m_scratchUnified
=
reinterpret_cast
<
size_type
*>
(
r
->
data
()
);
}
return
m_scratchUnified
;
}
//----------------------------------------------------------------------------
void
CudaInternal
::
finalize
()
{
was_finalized
=
1
;
if
(
0
!=
m_scratchSpace
||
0
!=
m_scratchFlags
)
{
Impl
::
finalize_host_cuda_lock_arrays
();
if
(
m_stream
)
{
for
(
size_type
i
=
1
;
i
<
m_streamCount
;
++
i
)
{
cudaStreamDestroy
(
m_stream
[
i
]
);
m_stream
[
i
]
=
0
;
}
::
free
(
m_stream
);
}
typedef
Kokkos
::
Experimental
::
Impl
::
SharedAllocationRecord
<
CudaSpace
>
RecordCuda
;
typedef
Kokkos
::
Experimental
::
Impl
::
SharedAllocationRecord
<
CudaHostPinnedSpace
>
RecordHost
;
RecordCuda
::
decrement
(
RecordCuda
::
get_record
(
m_scratchFlags
)
);
RecordCuda
::
decrement
(
RecordCuda
::
get_record
(
m_scratchSpace
)
);
RecordHost
::
decrement
(
RecordHost
::
get_record
(
m_scratchUnified
)
);
RecordCuda
::
decrement
(
RecordCuda
::
get_record
(
m_scratchConcurrentBitset
)
);
m_cudaDev
=
-
1
;
m_multiProcCount
=
0
;
m_maxWarpCount
=
0
;
m_maxBlock
=
0
;
m_maxSharedWords
=
0
;
m_scratchSpaceCount
=
0
;
m_scratchFlagsCount
=
0
;
m_scratchUnifiedCount
=
0
;
m_streamCount
=
0
;
m_scratchSpace
=
0
;
m_scratchFlags
=
0
;
m_scratchUnified
=
0
;
m_scratchConcurrentBitset
=
0
;
m_stream
=
0
;
}
}
//----------------------------------------------------------------------------
Cuda
::
size_type
cuda_internal_multiprocessor_count
()
{
return
CudaInternal
::
singleton
().
m_multiProcCount
;
}
CudaSpace
::
size_type
cuda_internal_maximum_concurrent_block_count
()
{
// Compute capability 5.0 through 6.2
enum
:
int
{
max_resident_blocks_per_multiprocessor
=
32
};
return
CudaInternal
::
singleton
().
m_multiProcCount
*
max_resident_blocks_per_multiprocessor
;
};
Cuda
::
size_type
cuda_internal_maximum_warp_count
()
{
return
CudaInternal
::
singleton
().
m_maxWarpCount
;
}
Cuda
::
size_type
cuda_internal_maximum_grid_count
()
{
return
CudaInternal
::
singleton
().
m_maxBlock
;
}
Cuda
::
size_type
cuda_internal_maximum_shared_words
()
{
return
CudaInternal
::
singleton
().
m_maxSharedWords
;
}
Cuda
::
size_type
*
cuda_internal_scratch_space
(
const
Cuda
::
size_type
size
)
{
return
CudaInternal
::
singleton
().
scratch_space
(
size
);
}
Cuda
::
size_type
*
cuda_internal_scratch_flags
(
const
Cuda
::
size_type
size
)
{
return
CudaInternal
::
singleton
().
scratch_flags
(
size
);
}
Cuda
::
size_type
*
cuda_internal_scratch_unified
(
const
Cuda
::
size_type
size
)
{
return
CudaInternal
::
singleton
().
scratch_unified
(
size
);
}
}
// namespace Impl
}
// namespace Kokkos
//----------------------------------------------------------------------------
namespace
Kokkos
{
Cuda
::
size_type
Cuda
::
detect_device_count
()
{
return
Impl
::
CudaInternalDevices
::
singleton
().
m_cudaDevCount
;
}
int
Cuda
::
concurrency
()
{
return
Impl
::
CudaInternal
::
singleton
().
m_maxConcurrency
;
}
int
Cuda
::
is_initialized
()
{
return
Impl
::
CudaInternal
::
singleton
().
is_initialized
();
}
void
Cuda
::
initialize
(
const
Cuda
::
SelectDevice
config
,
size_t
num_instances
)
{
Impl
::
CudaInternal
::
singleton
().
initialize
(
config
.
cuda_device_id
,
num_instances
);
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos
::
Profiling
::
initialize
();
#endif
}
std
::
vector
<
unsigned
>
Cuda
::
detect_device_arch
()
{
const
Impl
::
CudaInternalDevices
&
s
=
Impl
::
CudaInternalDevices
::
singleton
();
std
::
vector
<
unsigned
>
output
(
s
.
m_cudaDevCount
);
for
(
int
i
=
0
;
i
<
s
.
m_cudaDevCount
;
++
i
)
{
output
[
i
]
=
s
.
m_cudaProp
[
i
].
major
*
100
+
s
.
m_cudaProp
[
i
].
minor
;
}
return
output
;
}
Cuda
::
size_type
Cuda
::
device_arch
()
{
const
int
dev_id
=
Impl
::
CudaInternal
::
singleton
().
m_cudaDev
;
int
dev_arch
=
0
;
if
(
0
<=
dev_id
)
{
const
struct
cudaDeviceProp
&
cudaProp
=
Impl
::
CudaInternalDevices
::
singleton
().
m_cudaProp
[
dev_id
]
;
dev_arch
=
cudaProp
.
major
*
100
+
cudaProp
.
minor
;
}
return
dev_arch
;
}
void
Cuda
::
finalize
()
{
Impl
::
CudaInternal
::
singleton
().
finalize
();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos
::
Profiling
::
finalize
();
#endif
}
Cuda
::
Cuda
()
:
m_device
(
Impl
::
CudaInternal
::
singleton
().
m_cudaDev
)
,
m_stream
(
0
)
{
Impl
::
CudaInternal
::
singleton
().
verify_is_initialized
(
"Cuda instance constructor"
);
}
Cuda
::
Cuda
(
const
int
instance_id
)
:
m_device
(
Impl
::
CudaInternal
::
singleton
().
m_cudaDev
)
,
m_stream
(
Impl
::
CudaInternal
::
singleton
().
verify_is_initialized
(
"Cuda instance constructor"
)
?
Impl
::
CudaInternal
::
singleton
().
m_stream
[
instance_id
%
Impl
::
CudaInternal
::
singleton
().
m_streamCount
]
:
0
)
{}
void
Cuda
::
print_configuration
(
std
::
ostream
&
s
,
const
bool
)
{
Impl
::
CudaInternal
::
singleton
().
print_configuration
(
s
);
}
bool
Cuda
::
sleep
()
{
return
false
;
}
bool
Cuda
::
wake
()
{
return
true
;
}
void
Cuda
::
fence
()
{
Kokkos
::
Impl
::
cuda_device_synchronize
();
}
const
char
*
Cuda
::
name
()
{
return
"Cuda"
;
}
}
// namespace Kokkos
namespace
Kokkos
{
namespace
Experimental
{
UniqueToken
<
Kokkos
::
Cuda
,
Kokkos
::
Experimental
::
UniqueTokenScope
::
Global
>::
UniqueToken
(
Kokkos
::
Cuda
const
&
)
:
m_buffer
(
Kokkos
::
Impl
::
CudaInternal
::
singleton
().
m_scratchConcurrentBitset
)
,
m_count
(
Kokkos
::
Impl
::
CudaInternal
::
singleton
().
m_maxConcurrency
)
{}
}
// namespace Experimental
}
// namespace Kokkos
#else
void
KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR
()
{}
#endif
// KOKKOS_ENABLE_CUDA
Event Timeline
Log In to Comment