Value *d_values_in, ///< [in] Input values ping buffer
Value *d_values_out, ///< [in] Output values pong buffer
SizeT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
SizeT num_items, ///< [in] Total number of input data items
int current_bit, ///< [in] Bit position of current radix digit
bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
bool first_pass, ///< [in] Whether this is the first digit pass
bool last_pass, ///< [in] Whether this is the last digit pass
GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
* \brief DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory. ![](sorting_logo.png)
* \ingroup DeviceModule
*
* \par Overview
* The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
* items into ascending order. It relies upon a positional representation for
* keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
* characters, etc.) specified from least-significant to most-significant. For a
* given input sequence of keys and a set of rules specifying a total ordering
* of the symbolic alphabet, the radix sorting method produces a lexicographic
* ordering of those keys.
*
* \par
* DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
* <tt>unsigned char</tt>, \p int, \p double, etc. Although the direct radix sorting
* method can only be applied to unsigned integral types, BlockRadixSort
* is able to sort signed and floating-point types via simple bit-wise transformations
* that ensure lexicographic key ordering.
*
* \par Usage Considerations
* \cdp_class{DeviceRadixSort}
*
* \par Performance
*
* \image html lsd_sort_perf.png
*
*/
struct DeviceRadixSort
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
* Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations.
*/
template <
typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
typename SpineKernelPtr, ///< Function type of cub::SpineScanKernel
typename DownsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
typename Key, ///< Key type
typename Value, ///< Value type
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
SpineKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
KernelDispachParams &upsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
KernelDispachParams &downsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
SizeT num_items, ///< [in] Number of items to reduce
int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported );
#else
cudaError error = cudaSuccess;
do
{
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
* // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
*
* \endcode
*
* \tparam Key <b>[inferred]</b> Key type
* \tparam Value <b>[inferred]</b> Value type
*/
template <
typename Key,
typename Value>
__host__ __device__ __forceinline__
static cudaError_t SortPairs(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
int num_items, ///< [in] Number of items to reduce
int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
* // Sorted keys are referenced by d_keys.Current()
*
* \endcode
*
* \tparam Key <b>[inferred]</b> Key type
*/
template <typename Key>
__host__ __device__ __forceinline__
static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
int num_items, ///< [in] Number of items to reduce
int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.