RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins
RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins
};
/**
* Tuning policy for BlockRadixSortDownsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
int _ITEMS_PER_THREAD, ///< The number of consecutive downsweep keys to process per thread
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
PtxLoadModifier _LOAD_MODIFIER, ///< The PTX cache-modifier to use for loads
bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The cub::BlockScanAlgorithm algorithm to use
RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)