device_radix_sort.cuh
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Sep 14, 19:50

device_radix_sort.cuh
View Options


	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* cub::DeviceRadixSort provides operations for computing a device-wide, parallel reduction across data items residing within global memory.
	*/

	#pragma once

	#include <stdio.h>
	#include <iterator>

	#include "block/block_radix_sort_upsweep_tiles.cuh"
	#include "block/block_radix_sort_downsweep_tiles.cuh"
	#include "block/block_scan_tiles.cuh"
	#include "../grid/grid_even_share.cuh"
	#include "../util_debug.cuh"
	#include "../util_device.cuh"
	#include "../util_namespace.cuh"

	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document




	/******************************************************************************
	* Kernel entry points
	*****************************************************************************/

	/**
	* Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block.
	*/
	template <
	typename BlockRadixSortUpsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction
	typename Key, ///< Key type
	typename SizeT> ///< Integer type used for global array indexing
	__launch_bounds__ (int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1)
	__global__ void RadixSortUpsweepKernel(
	Key *d_keys, ///< [in] Input keys buffer
	SizeT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
	SizeT num_items, ///< [in] Total number of input data items
	int current_bit, ///< [in] Bit position of current radix digit
	bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
	bool first_pass, ///< [in] Whether this is the first digit pass
	GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
	{

	// Alternate policy for when fewer bits remain
	typedef typename BlockRadixSortUpsweepTilesPolicy::AltPolicy AltPolicy;

	// Parameterize two versions of BlockRadixSortUpsweepTiles type for the current configuration
	typedef BlockRadixSortUpsweepTiles<BlockRadixSortUpsweepTilesPolicy, Key, SizeT> BlockRadixSortUpsweepTilesT; // Primary
	typedef BlockRadixSortUpsweepTiles<AltPolicy, Key, SizeT> AltBlockRadixSortUpsweepTilesT; // Alternate (smaller bit granularity)

	// Shared memory storage
	__shared__ union
	{
	typename BlockRadixSortUpsweepTilesT::TempStorage pass_storage;
	typename AltBlockRadixSortUpsweepTilesT::TempStorage alt_pass_storage;
	} temp_storage;

	// Initialize even-share descriptor for this thread block
	even_share.BlockInit();

	// Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit)
	if (use_primary_bit_granularity)
	{
	// Primary granularity
	SizeT bin_count;
	BlockRadixSortUpsweepTilesT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles(
	even_share.block_offset,
	even_share.block_oob,
	bin_count);

	// Write out digit counts (striped)
	if (threadIdx.x < BlockRadixSortUpsweepTilesT::RADIX_DIGITS)
	{
	d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
	}
	}
	else
	{
	// Alternate granularity
	// Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit)
	SizeT bin_count;
	AltBlockRadixSortUpsweepTilesT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles(
	even_share.block_offset,
	even_share.block_oob,
	bin_count);

	// Write out digit counts (striped)
	if (threadIdx.x < AltBlockRadixSortUpsweepTilesT::RADIX_DIGITS)
	{
	d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
	}
	}
	}


	/**
	* Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
	*/
	template <
	typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction
	typename SizeT> ///< Integer type used for global array indexing
	__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS), 1)
	__global__ void RadixSortScanKernel(
	SizeT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
	int num_counts) ///< [in] Total number of bin-counts
	{
	// Parameterize the BlockScanTiles type for the current configuration
	typedef BlockScanTiles<BlockScanTilesPolicy, SizeT, SizeT, cub::Sum, SizeT, SizeT> BlockScanTilesT;

	// Shared memory storage
	__shared__ typename BlockScanTilesT::TempStorage temp_storage;

	// Block scan instance
	BlockScanTilesT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), SizeT(0)) ;

	// Process full input tiles
	int block_offset = 0;
	RunningBlockPrefixOp<SizeT> prefix_op;
	prefix_op.running_total = 0;
	while (block_offset < num_counts)
	{
	block_scan.ConsumeTile<true, false>(block_offset, prefix_op);
	block_offset += BlockScanTilesT::TILE_ITEMS;
	}
	}


	/**
	* Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
	*/
	template <
	typename BlockRadixSortDownsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction
	typename Key, ///< Key type
	typename Value, ///< Value type
	typename SizeT> ///< Integer type used for global array indexing
	__launch_bounds__ (int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS))
	__global__ void RadixSortDownsweepKernel(
	Key *d_keys_in, ///< [in] Input keys ping buffer
	Key *d_keys_out, ///< [in] Output keys pong buffer
	Value *d_values_in, ///< [in] Input values ping buffer
	Value *d_values_out, ///< [in] Output values pong buffer
	SizeT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
	SizeT num_items, ///< [in] Total number of input data items
	int current_bit, ///< [in] Bit position of current radix digit
	bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
	bool first_pass, ///< [in] Whether this is the first digit pass
	bool last_pass, ///< [in] Whether this is the last digit pass
	GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
	{

	// Alternate policy for when fewer bits remain
	typedef typename BlockRadixSortDownsweepTilesPolicy::AltPolicy AltPolicy;

	// Parameterize two versions of BlockRadixSortDownsweepTiles type for the current configuration
	typedef BlockRadixSortDownsweepTiles<BlockRadixSortDownsweepTilesPolicy, Key, Value, SizeT> BlockRadixSortDownsweepTilesT;
	typedef BlockRadixSortDownsweepTiles<AltPolicy, Key, Value, SizeT> AltBlockRadixSortDownsweepTilesT;

	// Shared memory storage
	__shared__ union
	{
	typename BlockRadixSortDownsweepTilesT::TempStorage pass_storage;
	typename AltBlockRadixSortDownsweepTilesT::TempStorage alt_pass_storage;

	} temp_storage;

	// Initialize even-share descriptor for this thread block
	even_share.BlockInit();

	if (use_primary_bit_granularity)
	{
	// Process input tiles
	BlockRadixSortDownsweepTilesT(temp_storage.pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles(
	even_share.block_offset,
	even_share.block_oob);
	}
	else
	{
	// Process input tiles
	AltBlockRadixSortDownsweepTilesT(temp_storage.alt_pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles(
	even_share.block_offset,
	even_share.block_oob);
	}
	}


	#endif // DOXYGEN_SHOULD_SKIP_THIS





	/******************************************************************************
	* DeviceRadixSort
	*****************************************************************************/

	/**
	* \brief DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory. ![](sorting_logo.png)
	* \ingroup DeviceModule
	*
	* \par Overview
	* The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
	* items into ascending order. It relies upon a positional representation for
	* keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
	* characters, etc.) specified from least-significant to most-significant. For a
	* given input sequence of keys and a set of rules specifying a total ordering
	* of the symbolic alphabet, the radix sorting method produces a lexicographic
	* ordering of those keys.
	*
	* \par
	* DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
	* <tt>unsigned char</tt>, \p int, \p double, etc. Although the direct radix sorting
	* method can only be applied to unsigned integral types, BlockRadixSort
	* is able to sort signed and floating-point types via simple bit-wise transformations
	* that ensure lexicographic key ordering.
	*
	* \par Usage Considerations
	* \cdp_class{DeviceRadixSort}
	*
	* \par Performance
	*
	* \image html lsd_sort_perf.png
	*
	*/
	struct DeviceRadixSort
	{
	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document


	/******************************************************************************
	* Constants and typedefs
	******************************************************************************/

	/// Generic structure for encapsulating dispatch properties codified in block policy.
	struct KernelDispachParams
	{
	int block_threads;
	int items_per_thread;
	cudaSharedMemConfig smem_config;
	int radix_bits;
	int alt_radix_bits;
	int subscription_factor;
	int tile_size;

	template <typename SortBlockPolicy>
	__host__ __device__ __forceinline__
	void InitUpsweepPolicy(int subscription_factor = 1)
	{
	block_threads = SortBlockPolicy::BLOCK_THREADS;
	items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
	radix_bits = SortBlockPolicy::RADIX_BITS;
	alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
	smem_config = cudaSharedMemBankSizeFourByte;
	this->subscription_factor = subscription_factor;
	tile_size = block_threads * items_per_thread;
	}

	template <typename ScanBlockPolicy>
	__host__ __device__ __forceinline__
	void InitScanPolicy()
	{
	block_threads = ScanBlockPolicy::BLOCK_THREADS;
	items_per_thread = ScanBlockPolicy::ITEMS_PER_THREAD;
	radix_bits = 0;
	alt_radix_bits = 0;
	smem_config = cudaSharedMemBankSizeFourByte;
	subscription_factor = 0;
	tile_size = block_threads * items_per_thread;
	}

	template <typename SortBlockPolicy>
	__host__ __device__ __forceinline__
	void InitDownsweepPolicy(int subscription_factor = 1)
	{
	block_threads = SortBlockPolicy::BLOCK_THREADS;
	items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
	radix_bits = SortBlockPolicy::RADIX_BITS;
	alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
	smem_config = SortBlockPolicy::SMEM_CONFIG;
	this->subscription_factor = subscription_factor;
	tile_size = block_threads * items_per_thread;
	}
	};



	/******************************************************************************
	* Tuning policies
	******************************************************************************/

	/// Specializations of tuned policy types for different PTX architectures
	template <typename Key, typename Value, typename SizeT, int ARCH>
	struct TunedPolicies;

	/// SM35 tune
	template <typename Key, typename Value, typename SizeT>
	struct TunedPolicies<Key, Value, SizeT, 350>
	{
	enum {
	KEYS_ONLY = (Equals<Value, NullType>::VALUE),
	SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
	RADIX_BITS = 5,
	};

	// UpsweepPolicy
	typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
	typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
	typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
	/*
	// 4bit
	typedef BlockRadixSortUpsweepTilesPolicy <128, 15, LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
	typedef BlockRadixSortUpsweepTilesPolicy <256, 13, LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
	*/
	// ScanPolicy
	typedef BlockScanTilesPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

	// DownsweepPolicy
	typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
	typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
	typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;

	/*
	// 4bit
	typedef BlockRadixSortDownsweepTilesPolicy <128, 15, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
	typedef BlockRadixSortDownsweepTilesPolicy <256, 13, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
	*/
	enum { SUBSCRIPTION_FACTOR = 7 };
	};


	/// SM20 tune
	template <typename Key, typename Value, typename SizeT>
	struct TunedPolicies<Key, Value, SizeT, 200>
	{
	enum {
	KEYS_ONLY = (Equals<Value, NullType>::VALUE),
	SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
	RADIX_BITS = 5,
	};

	// UpsweepPolicy
	typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
	typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
	typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;

	// ScanPolicy
	typedef BlockScanTilesPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

	// DownsweepPolicy
	typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
	typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
	typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;

	enum { SUBSCRIPTION_FACTOR = 3 };
	};


	/// SM10 tune
	template <typename Key, typename Value, typename SizeT>
	struct TunedPolicies<Key, Value, SizeT, 100>
	{
	enum {
	RADIX_BITS = 4,
	};

	// UpsweepPolicy
	typedef BlockRadixSortUpsweepTilesPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;

	// ScanPolicy
	typedef BlockScanTilesPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

	// DownsweepPolicy
	typedef BlockRadixSortDownsweepTilesPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;

	enum { SUBSCRIPTION_FACTOR = 3 };
	};



	/******************************************************************************
	* Default policy initializer
	******************************************************************************/

	/// Tuning policy for the PTX architecture that DeviceRadixSort operations will get dispatched to
	template <typename Key, typename Value, typename SizeT>
	struct PtxDefaultPolicies
	{

	static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
	350 :
	(CUB_PTX_ARCH >= 200) ?
	200 :
	100;

	// Tuned policy set for the current PTX compiler pass
	typedef TunedPolicies<Key, Value, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;

	// UpsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
	struct UpsweepPolicy : PtxTunedPolicies::UpsweepPolicy {};

	// ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
	struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};

	// DownsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
	struct DownsweepPolicy : PtxTunedPolicies::DownsweepPolicy {};

	// Subscription factor for the current PTX compiler pass
	enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR };


	/**
	* Initialize dispatch params with the policies corresponding to the PTX assembly we will use
	*/
	static void InitDispatchParams(
	int ptx_version,
	KernelDispachParams &upsweep_dispatch_params,
	KernelDispachParams &scan_dispatch_params,
	KernelDispachParams &downsweep_dispatch_params)
	{
	if (ptx_version >= 350)
	{
	typedef TunedPolicies<Key, Value, SizeT, 350> TunedPolicies;
	upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
	scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
	downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
	}
	else if (ptx_version >= 200)
	{
	typedef TunedPolicies<Key, Value, SizeT, 200> TunedPolicies;
	upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
	scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
	downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
	}
	else
	{
	typedef TunedPolicies<Key, Value, SizeT, 100> TunedPolicies;
	upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
	scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
	downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
	}
	}
	};



	/******************************************************************************
	* Utility methods
	******************************************************************************/

	/**
	* Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations.
	*/
	template <
	typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
	typename SpineKernelPtr, ///< Function type of cub::SpineScanKernel
	typename DownsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
	typename Key, ///< Key type
	typename Value, ///< Value type
	typename SizeT> ///< Integer type used for global array indexing
	__host__ __device__ __forceinline__
	static cudaError_t Dispatch(
	void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
	size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
	UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
	SpineKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
	DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
	KernelDispachParams &upsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
	KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
	KernelDispachParams &downsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
	DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
	DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
	SizeT num_items, ///< [in] Number of items to reduce
	int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
	int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
	cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
	{
	#ifndef CUB_RUNTIME_ENABLED

	// Kernel launch not supported from this device
	return CubDebug(cudaErrorNotSupported );

	#else

	cudaError error = cudaSuccess;
	do
	{
	// Get device ordinal
	int device_ordinal;
	if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

	// Get SM count
	int sm_count;
	if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

	// Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
	int downsweep_sm_occupancy = CUB_MIN(
	ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
	ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / downsweep_dispatch_params.block_threads);
	int upsweep_sm_occupancy = downsweep_sm_occupancy;

	#ifndef __CUDA_ARCH__
	// We're on the host, so come up with more accurate estimates of SM occupancy from actual device properties
	Device device_props;
	if (CubDebug(error = device_props.Init(device_ordinal))) break;

	if (CubDebug(error = device_props.MaxSmOccupancy(
	downsweep_sm_occupancy,
	downsweep_kernel,
	downsweep_dispatch_params.block_threads))) break;

	if (CubDebug(error = device_props.MaxSmOccupancy(
	upsweep_sm_occupancy,
	upsweep_kernel,
	upsweep_dispatch_params.block_threads))) break;
	#endif
	// Get device occupancies
	int downsweep_occupancy = downsweep_sm_occupancy * sm_count;

	// Get even-share work distribution descriptor
	GridEvenShare<SizeT> even_share;
	int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor;
	int downsweep_grid_size;
	even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size);
	downsweep_grid_size = even_share.grid_size;

	// Get number of spine elements (round up to nearest spine scan kernel tile size)
	int bins = 1 << downsweep_dispatch_params.radix_bits;
	int spine_size = downsweep_grid_size * bins;
	int spine_tiles = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
	spine_size = spine_tiles * scan_dispatch_params.tile_size;

	int alt_bins = 1 << downsweep_dispatch_params.alt_radix_bits;
	int alt_spine_size = downsweep_grid_size * alt_bins;
	int alt_spine_tiles = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
	alt_spine_size = alt_spine_tiles * scan_dispatch_params.tile_size;

	// Temporary storage allocation requirements
	void* allocations[1];
	size_t allocation_sizes[1] =
	{
	spine_size * sizeof(SizeT), // bytes needed for privatized block digit histograms
	};

	// Alias temporaries (or set the necessary size of the storage allocation)
	if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

	// Return if the caller is simply requesting the size of the storage allocation
	if (d_temp_storage == NULL)
	return cudaSuccess;

	// Privatized per-block digit histograms
	SizeT d_spine = (SizeT) allocations[0];

	#ifndef __CUDA_ARCH__
	// Get current smem bank configuration
	cudaSharedMemConfig original_smem_config;
	if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
	cudaSharedMemConfig current_smem_config = original_smem_config;
	#endif
	// Iterate over digit places
	int current_bit = begin_bit;
	while (current_bit < end_bit)
	{
	// Use primary bit granularity if bits remaining is a whole multiple of bit primary granularity
	int bits_remaining = end_bit - current_bit;
	bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0);
	int radix_bits = (use_primary_bit_granularity) ?
	downsweep_dispatch_params.radix_bits :
	downsweep_dispatch_params.alt_radix_bits;

	#ifndef __CUDA_ARCH__
	// Update smem config if necessary
	if (current_smem_config != upsweep_dispatch_params.smem_config)
	{
	if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config))) break;
	current_smem_config = upsweep_dispatch_params.smem_config;
	}
	#endif

	// Log upsweep_kernel configuration
	if (stream_synchronous)
	CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
	downsweep_grid_size, upsweep_dispatch_params.block_threads, (long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.selector, current_bit, radix_bits);

	// Invoke upsweep_kernel with same grid size as downsweep_kernel
	upsweep_kernel<<<downsweep_grid_size, upsweep_dispatch_params.block_threads, 0, stream>>>(
	d_keys.d_buffers[d_keys.selector],
	d_spine,
	num_items,
	current_bit,
	use_primary_bit_granularity,
	(current_bit == begin_bit),
	even_share);

	// Sync the stream if specified
	if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;

	// Log scan_kernel configuration
	if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
	1, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread);

	// Invoke scan_kernel
	scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>(
	d_spine,
	(use_primary_bit_granularity) ? spine_size : alt_spine_size);

	// Sync the stream if specified
	if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;

	#ifndef __CUDA_ARCH__
	// Update smem config if necessary
	if (current_smem_config != downsweep_dispatch_params.smem_config)
	{
	if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config))) break;
	current_smem_config = downsweep_dispatch_params.smem_config;
	}
	#endif

	// Log downsweep_kernel configuration
	if (stream_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
	downsweep_grid_size, downsweep_dispatch_params.block_threads, (long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy);

	// Invoke downsweep_kernel
	downsweep_kernel<<<downsweep_grid_size, downsweep_dispatch_params.block_threads, 0, stream>>>(
	d_keys.d_buffers[d_keys.selector],
	d_keys.d_buffers[d_keys.selector ^ 1],
	d_values.d_buffers[d_values.selector],
	d_values.d_buffers[d_values.selector ^ 1],
	d_spine,
	num_items,
	current_bit,
	use_primary_bit_granularity,
	(current_bit == begin_bit),
	(current_bit + downsweep_dispatch_params.radix_bits >= end_bit),
	even_share);

	// Sync the stream if specified
	if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;

	// Invert selectors
	d_keys.selector ^= 1;
	d_values.selector ^= 1;

	// Update current bit position
	current_bit += radix_bits;
	}

	#ifndef __CUDA_ARCH__
	// Reset smem config if necessary
	if (current_smem_config != original_smem_config)
	{
	if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
	}
	#endif

	}
	while (0);

	return error;

	#endif // CUB_RUNTIME_ENABLED
	}



	#endif // DOXYGEN_SHOULD_SKIP_THIS

	/******************************************************************************
	* Interface
	******************************************************************************/


	/**
	* \brief Sorts key-value pairs.
	*
	* \par
	* The sorting operation requires a pair of key buffers and a pair of value
	* buffers. Each pair is wrapped in a DoubleBuffer structure whose member
	* DoubleBuffer::Current() references the active buffer. The currently-active
	* buffer may be changed by the sorting operation.
	*
	* \devicestorage
	*
	* \cdp
	*
	* \par
	* The code snippet below illustrates the sorting of a device vector of \p int keys
	* with associated vector of \p int values.
	* \par
	* \code
	* #include <cub/cub.cuh>
	* ...
	*
	* // Create a set of DoubleBuffers to wrap pairs of device pointers for
	* // sorting data (keys, values, and equivalently-sized alternate buffers)
	* int num_items = ...
	* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
	* cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
	*
	* // Determine temporary device storage requirements for sorting operation
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
	*
	* // Allocate temporary storage for sorting operation
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run sorting operation
	* cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
	*
	* // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
	*
	* \endcode
	*
	* \tparam Key <b>[inferred]</b> Key type
	* \tparam Value <b>[inferred]</b> Value type
	*/
	template <
	typename Key,
	typename Value>
	__host__ __device__ __forceinline__
	static cudaError_t SortPairs(
	void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
	size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
	DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
	DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
	int num_items, ///< [in] Number of items to reduce
	int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
	int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
	cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
	{
	// Type used for array indexing
	typedef int SizeT;

	// Tuning polices
	typedef PtxDefaultPolicies<Key, Value, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies
	typedef typename PtxDefaultPolicies::UpsweepPolicy UpsweepPolicy; // Upsweep kernel policy
	typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy
	typedef typename PtxDefaultPolicies::DownsweepPolicy DownsweepPolicy; // Downsweep kernel policy

	cudaError error = cudaSuccess;
	do
	{
	// Declare dispatch parameters
	KernelDispachParams upsweep_dispatch_params;
	KernelDispachParams scan_dispatch_params;
	KernelDispachParams downsweep_dispatch_params;

	#ifdef __CUDA_ARCH__
	// We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
	upsweep_dispatch_params.InitUpsweepPolicy<UpsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
	scan_dispatch_params.InitScanPolicy<ScanPolicy>();
	downsweep_dispatch_params.InitDownsweepPolicy<DownsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
	#else
	// We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
	int ptx_version;
	if (CubDebug(error = PtxVersion(ptx_version))) break;
	PtxDefaultPolicies::InitDispatchParams(
	ptx_version,
	upsweep_dispatch_params,
	scan_dispatch_params,
	downsweep_dispatch_params);
	#endif
	// Dispatch
	if (CubDebug(error = Dispatch(
	d_temp_storage,
	temp_storage_bytes,
	RadixSortUpsweepKernel<UpsweepPolicy, Key, SizeT>,
	RadixSortScanKernel<ScanPolicy, SizeT>,
	RadixSortDownsweepKernel<DownsweepPolicy, Key, Value, SizeT>,
	upsweep_dispatch_params,
	scan_dispatch_params,
	downsweep_dispatch_params,
	d_keys,
	d_values,
	num_items,
	begin_bit,
	end_bit,
	stream,
	stream_synchronous))) break;
	}
	while (0);

	return error;
	}


	/**
	* \brief Sorts keys
	*
	* \par
	* The sorting operation requires a pair of key buffers. The pair is
	* wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current()
	* references the active buffer. The currently-active buffer may be changed
	* by the sorting operation.
	*
	* \devicestorage
	*
	* \cdp
	*
	* \par
	* The code snippet below illustrates the sorting of a device vector of \p int keys.
	* \par
	* \code
	* #include <cub/cub.cuh>
	* ...
	*
	* // Create a set of DoubleBuffers to wrap pairs of device pointers for
	* // sorting data (keys and equivalently-sized alternate buffer)
	* int num_items = ...
	* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
	*
	* // Determine temporary device storage requirements for sorting operation
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
	*
	* // Allocate temporary storage for sorting operation
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run sorting operation
	* cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
	*
	* // Sorted keys are referenced by d_keys.Current()
	*
	* \endcode
	*
	* \tparam Key <b>[inferred]</b> Key type
	*/
	template <typename Key>
	__host__ __device__ __forceinline__
	static cudaError_t SortKeys(
	void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
	size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
	DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
	int num_items, ///< [in] Number of items to reduce
	int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
	int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
	cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
	{
	DoubleBuffer<NullType> d_values;
	return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, stream_synchronous);
	}

	};


	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)

device_radix_sort.cuhNo OneTemporaryActions

File Metadata

device_radix_sort.cuhView Options

Event Timeline

device_radix_sort.cuh
No OneTemporary
Actions

device_radix_sort.cuh
View Options