* \brief DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. ![](device_scan.png)
* \ingroup DeviceModule
*
* \par Overview
* Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
* produces an output list where each element is computed to be the reduction
* of the elements occurring earlier in the input list. <em>Prefix sum</em>
* connotes a prefix scan with the addition operator. The term \em inclusive indicates
* that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
* The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
* the <em>i</em><sup>th</sup> output reduction.
*
* \par Usage Considerations
* \cdp_class{DeviceScan}
*
* \par Performance
*
* \image html scan_perf.png
*
*/
struct DeviceScan
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel
typename ScanKernelPtr, ///< Function type of cub::ScanKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
int ptx_version, ///< [in] PTX version
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanKernel
KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
* Internal scan dispatch routine for using default tuning policies
*/
template <
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t ExclusiveSum(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
* \tparam Identity <b>[inferred]</b> Type of the \p identity value used Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename ScanOp,
typename Identity>
__host__ __device__ __forceinline__
static cudaError_t ExclusiveScan(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t InclusiveSum(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename ScanOp>
__host__ __device__ __forceinline__
static cudaError_t InclusiveScan(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.