 * \brief DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values that are demarcated by runs of consecutive, equal-valued keys, with data residing within global memory. The resulting value-aggregates and their corresponding keys are compacted.
 */
struct DeviceReduceByKey
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
typename InitScanKernelPtr, ///< Function type of cub::InitScanKernel
typename MultiBlockScanKernelPtr, ///< Function type of cub::MultiBlockScanKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InitScanKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::InitScanKernel
MultiBlockScanKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockScanKernel
KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
#else
if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
#endif
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
* Internal scan dispatch routine for using default tuning policies
*/
template <
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
* \brief Computes device-wide reductions of consecutive values whose corresponding keys are equal.
*
* The resulting output lists of value-aggregates and their corresponding keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
* \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename ValueInputIteratorRA,
typename ValueOutputIteratorRA,
typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t ReduceValues(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
ValueInputIteratorRA d_values_in, ///< [in] Value input data
ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
int num_items, ///< [in] Total number of input pairs
ReductionOp reduction_op, ///< [in] Binary value reduction operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
* \brief Computes device-wide sums of consecutive values whose corresponding keys are equal. (Summation uses the addition operator of the value type; no user-supplied reduction functor is taken.)
*
* The resulting output lists of value-aggregates and their corresponding keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
* \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename ValueInputIteratorRA,
typename ValueOutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t SumValues(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
ValueInputIteratorRA d_values_in, ///< [in] Value input data
ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
int num_items, ///< [in] Total number of input pairs
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
* \brief Computes the "run-length" of each group of consecutive, equal-valued keys.
*
* The resulting output lists of run-length counts and their corresponding keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam CountOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for output of key-counts whose value type must be convertible to an integer type (may be a simple pointer type)
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename CountOutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t RunLengths(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
CountOutputIteratorRA d_counts_out, ///< [out] Run-length counts output data (compacted)
int num_items, ///< [in] Total number of keys
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
* \brief Removes duplicates within each group of consecutive, equal-valued keys. Only the first key from each group (and corresponding value) is kept.
*
* The resulting keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
* \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA
*/
// NOTE(review): the ReductionOp template parameter below has no corresponding
// function argument, so it cannot be inferred at a call site and appears to be
// a copy/paste remnant from ReduceValues — confirm against the implementation.
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename ValueInputIteratorRA,
typename ValueOutputIteratorRA,
typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t Unique(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
ValueInputIteratorRA d_values_in, ///< [in] Value input data
ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
int num_items, ///< [in] Total number of input pairs
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.