Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F76922560
util_arch.cuh
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Aug 11, 05:24
Size
11 KB
Mime Type
text/x-c++
Expires
Tue, Aug 13, 05:24 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
19782566
Attached To
rLAMMPS lammps
util_arch.cuh
View Options
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Static architectural properties by SM version.
*/
/******************************************************************************
* Static architectural properties by SM version.
*
* "Device" reflects the PTX architecture targeted by the active compiler
* pass. It provides useful compile-time statics within device code. E.g.,:
*
* __shared__ int[Device::WARP_THREADS];
*
* int padded_offset = threadIdx.x + (threadIdx.x >> Device::LOG_SMEM_BANKS);
*
******************************************************************************/
#pragma once
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
#ifndef __CUDA_ARCH__
#define CUB_PTX_ARCH 0
#else
#define CUB_PTX_ARCH __CUDA_ARCH__
#endif
/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
#if !defined(__CUDA_ARCH__) || defined(CUB_CDP)
#define CUB_RUNTIME_ENABLED
#endif
/// Execution space for destructors
#if ((CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH < 200))
#define CUB_DESTRUCTOR __host__
#else
#define CUB_DESTRUCTOR __host__ __device__
#endif
/**
* \brief Structure for statically reporting CUDA device properties, parameterized by SM architecture.
*
* The default specialization is for SM10.
*/
template <int SM_ARCH>
struct ArchProps
{
enum
{
LOG_WARP_THREADS =
5, /// Log of the number of threads per warp
WARP_THREADS =
1 << LOG_WARP_THREADS, /// Number of threads per warp
LOG_SMEM_BANKS =
4, /// Log of the number of smem banks
SMEM_BANKS =
1 << LOG_SMEM_BANKS, /// The number of smem banks
SMEM_BANK_BYTES =
4, /// Size of smem bank words
SMEM_BYTES =
16 * 1024, /// Maximum SM shared memory
SMEM_ALLOC_UNIT =
512, /// Smem allocation size in bytes
REGS_BY_BLOCK =
true, /// Whether or not the architecture allocates registers by block (or by warp)
REG_ALLOC_UNIT =
256, /// Number of registers allocated at a time per block (or by warp)
WARP_ALLOC_UNIT =
2, /// Granularity of warps for which registers are allocated
MAX_SM_THREADS =
768, /// Maximum number of threads per SM
MAX_SM_THREADBLOCKS =
8, /// Maximum number of thread blocks per SM
MAX_BLOCK_THREADS =
512, /// Maximum number of thread per thread block
MAX_SM_REGISTERS =
8 * 1024, /// Maximum number of registers per SM
};
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Architecture properties for SM30
*/
template <>
struct ArchProps<300>
{
enum
{
LOG_WARP_THREADS = 5, // 32 threads per warp
WARP_THREADS = 1 << LOG_WARP_THREADS,
LOG_SMEM_BANKS = 5, // 32 banks
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
SMEM_BANK_BYTES = 4, // 4 byte bank words
SMEM_BYTES = 48 * 1024, // 48KB shared memory
SMEM_ALLOC_UNIT = 256, // 256B smem allocation segment size
REGS_BY_BLOCK = false, // Allocates registers by warp
REG_ALLOC_UNIT = 256, // 256 registers allocated at a time per warp
WARP_ALLOC_UNIT = 4, // Registers are allocated at a granularity of every 4 warps per threadblock
MAX_SM_THREADS = 2048, // 2K max threads per SM
MAX_SM_THREADBLOCKS = 16, // 16 max threadblocks per SM
MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock
MAX_SM_REGISTERS = 64 * 1024, // 64K max registers per SM
};
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
target.template Callback<ArchProps>();
}
};
/**
* Architecture properties for SM20
*/
template <>
struct ArchProps<200>
{
enum
{
LOG_WARP_THREADS = 5, // 32 threads per warp
WARP_THREADS = 1 << LOG_WARP_THREADS,
LOG_SMEM_BANKS = 5, // 32 banks
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
SMEM_BANK_BYTES = 4, // 4 byte bank words
SMEM_BYTES = 48 * 1024, // 48KB shared memory
SMEM_ALLOC_UNIT = 128, // 128B smem allocation segment size
REGS_BY_BLOCK = false, // Allocates registers by warp
REG_ALLOC_UNIT = 64, // 64 registers allocated at a time per warp
WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock
MAX_SM_THREADS = 1536, // 1536 max threads per SM
MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM
MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock
MAX_SM_REGISTERS = 32 * 1024, // 32K max registers per SM
};
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
if (sm_version > 200) {
ArchProps<300>::Callback(target, sm_version);
} else {
target.template Callback<ArchProps>();
}
}
};
/**
* Architecture properties for SM12
*/
template <>
struct ArchProps<120>
{
enum
{
LOG_WARP_THREADS = 5, // 32 threads per warp
WARP_THREADS = 1 << LOG_WARP_THREADS,
LOG_SMEM_BANKS = 4, // 16 banks
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
SMEM_BANK_BYTES = 4, // 4 byte bank words
SMEM_BYTES = 16 * 1024, // 16KB shared memory
SMEM_ALLOC_UNIT = 512, // 512B smem allocation segment size
REGS_BY_BLOCK = true, // Allocates registers by threadblock
REG_ALLOC_UNIT = 512, // 512 registers allocated at time per threadblock
WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock
MAX_SM_THREADS = 1024, // 1024 max threads per SM
MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM
MAX_BLOCK_THREADS = 512, // 512 max threads per threadblock
MAX_SM_REGISTERS = 16 * 1024, // 16K max registers per SM
};
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
if (sm_version > 120) {
ArchProps<200>::Callback(target, sm_version);
} else {
target.template Callback<ArchProps>();
}
}
};
/**
* Architecture properties for SM10. Derives from the default ArchProps specialization.
*/
template <>
struct ArchProps<100> : ArchProps<0>
{
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
if (sm_version > 100) {
ArchProps<120>::Callback(target, sm_version);
} else {
target.template Callback<ArchProps>();
}
}
};
/**
* Architecture properties for SM35
*/
template <>
struct ArchProps<350> : ArchProps<300> {}; // Derives from SM30
/**
* Architecture properties for SM21
*/
template <>
struct ArchProps<210> : ArchProps<200> {}; // Derives from SM20
/**
* Architecture properties for SM13
*/
template <>
struct ArchProps<130> : ArchProps<120> {}; // Derives from SM12
/**
* Architecture properties for SM11
*/
template <>
struct ArchProps<110> : ArchProps<100> {}; // Derives from SM10
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief The architectural properties for the PTX version targeted by the active compiler pass.
*/
struct PtxArchProps : ArchProps<CUB_PTX_ARCH> {};
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
Event Timeline
Log In to Comment