* \brief DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory. ![](reduce_logo.png)
* \ingroup DeviceModule
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
*
* \par Usage Considerations
* \cdp_class{DeviceReduce}
*
* \par Performance
*
* \image html reduction_perf.png
*
*/
struct DeviceReduce
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
* Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations.
*/
template <
typename ReducePrivatizedKernelPtr, ///< Function type of cub::ReducePrivatizedKernel
typename ReduceSingleKernelPtr, ///< Function type of cub::ReduceSingleKernel
typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename SizeT, ///< Integer type used for global array indexing
typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
ReducePrivatizedKernelPtr privatized_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReducePrivatizedKernel
ReduceSingleKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceSingleKernel
ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel
KernelDispachParams &privatized_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p privatized_kernel_ptr was compiled for
KernelDispachParams &single_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
SizeT num_items, ///< [in] Number of items to reduce
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t Reduce(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
int num_items, ///< [in] Number of items to reduce
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t Sum(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
int num_items, ///< [in] Number of items to reduce
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.