/**
* Thread utilities for sequential prefix scan over statically-sized array types
*/
#pragma once
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup ThreadModule
* @{
*/
/**
* \name Sequential prefix scan over statically-sized array types
* @{
*/
/**
 * \brief Sequentially compute an exclusive prefix scan across the first \p LENGTH items of \p input, storing the partials in \p output and returning the running aggregate.
 *
 * <tt>output[0]</tt> always receives \p prefix.  When \p apply_prefix is true
 * the prefix is also combined (as the left operand) into every later partial
 * and into the returned aggregate; when it is false, the later partials and
 * the aggregate are built from the input items alone.
 *
 * Every input item is read before the output slot with the same index is
 * written, so \p output may point at the same storage as \p input.
 *
 * \tparam LENGTH   Number of items read from \p input and written to \p output
 * \tparam T        <b>[inferred]</b> Element type being scanned
 * \tparam ScanOp   <b>[inferred]</b> Binary scan functor type providing <tt>T operator()(const T &a, const T &b)</tt>
 */
template <
    int         LENGTH,
    typename    T,
    typename    ScanOp>
__device__ __forceinline__ T ThreadScanExclusive(
    T           *input,                 ///< [in] Items to scan
    T           *output,                ///< [out] Destination for the scanned items (may alias \p input)
    ScanOp      scan_op,                ///< [in] Binary scan functor
    T           prefix,                 ///< [in] Seed value; always stored to <tt>output[0]</tt>
    bool        apply_prefix = true)    ///< [in] When true, \p prefix is combined into the running aggregate; when false, it is only stored to <tt>output[0]</tt>
{
    // Running aggregate through item 0.  The item is fetched before
    // output[0] is stored because the two arrays may share storage.
    T running = input[0];
    if (apply_prefix)
    {
        running = scan_op(prefix, running);
    }
    output[0] = prefix;

    #pragma unroll
    for (int item = 1; item < LENGTH; ++item)
    {
        // Combine first (reads input[item]), then publish the aggregate of
        // the preceding items as this slot's exclusive result.
        T next = scan_op(running, input[item]);
        output[item] = running;
        running = next;
    }

    return running;
}
/**
 * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
 *
 * Convenience overload for fixed-size arrays: \p LENGTH is deduced from the
 * array type and the work is forwarded unchanged to the pointer-based
 * ThreadScanExclusive overload declared earlier in this file.
 *
 * \tparam LENGTH   <b>[inferred]</b> Length of \p input and \p output arrays
 * \tparam T        <b>[inferred]</b> The data type to be scanned.
 * \tparam ScanOp   <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
 */
template <
    int         LENGTH,
    typename    T,
    typename    ScanOp>
__device__ __forceinline__ T ThreadScanExclusive(
    T           (&input)[LENGTH],       ///< [in] Input array
    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
    ScanOp      scan_op,                ///< [in] Binary scan operator
    T           prefix,                 ///< [in] Prefix to seed scan with
    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
{
    // Decay both array references to element pointers so overload resolution
    // selects the pointer-based implementation rather than this overload.
    return ThreadScanExclusive<LENGTH>(
        static_cast<T *>(input),
        static_cast<T *>(output),
        scan_op,
        prefix,
        apply_prefix);
}
/**
* \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
    int         LENGTH,
    typename    T,
    typename    ScanOp>
__device__ __forceinline__ T ThreadScanInclusive(
    T           *input,                 ///< [in] Items to scan
    T           *output,                ///< [out] Destination for the scanned items (may alias \p input)
    ScanOp      scan_op,                ///< [in] Binary scan functor
    T           prefix,                 ///< [in] Seed value combined (as the left operand) with the first item when \p apply_prefix is true
    bool        apply_prefix = true)    ///< [in] When false, \p prefix is ignored and the scan starts from <tt>input[0]</tt> alone
{
    // Seed the running aggregate with the first item, folding in the
    // caller's prefix only when requested.
    T aggregate = input[0];
    if (apply_prefix)
    {
        aggregate = scan_op(prefix, aggregate);
    }
    output[0] = aggregate;

    // Fold in the remaining items.  input[item] is consumed before
    // output[item] is stored, so the two arrays may share storage.
    #pragma unroll
    for (int item = 1; item < LENGTH; ++item)
    {
        aggregate = scan_op(aggregate, input[item]);
        output[item] = aggregate;
    }

    // The last inclusive partial is the aggregate over all scanned items.
    return aggregate;
}
/**
* \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanInclusive(
T (&input)[LENGTH], ///< [in] Input array
T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T prefix, ///< [in] Prefix to seed scan with
bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)