* The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads.
*/
#pragma once
#include "specializations/warp_reduce_shfl.cuh"
#include "specializations/warp_reduce_smem.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup WarpModule
* @{
*/
/**
* \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. ![](warp_reduce_logo.png)
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
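* For example, a sum-reduction of <tt>{2, 7, 1, 8}</tt> under the binary operator <tt>+</tt> produces the single aggregate <tt>18</tt>.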
*
* \tparam T The reduction input/output element type
* \tparam LOGICAL_WARPS <b>[optional]</b> The number of entrant "logical" warps performing concurrent warp reductions. Default is 1.
* \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
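*
* \par
* For example, <tt>cub::WarpReduce<int, 2, 16></tt> specializes the collective for two concurrent 16-thread logical warps (e.g., partitioning a 32-thread block).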
*
* \par Simple Examples
* \warpcollective{WarpReduce}
* \par
* The code snippet below illustrates four concurrent warp sum reductions within a block of
* 128 threads (one for each of the four 32-thread warps).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize WarpReduce for 4 warps on type int
*     typedef cub::WarpReduce<int, 4> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item per thread
*     int thread_data = ...
*
*     // Return the warp-wide sums to each lane0
*     int aggregate = WarpReduce(temp_storage).Sum(thread_data);
* }
* \endcode
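* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520, \p 2544,
* and \p 3568, respectively (and is undefined in other threads).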
*/
template <
typename T,
int LOGICAL_WARPS = 1,
int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS>
class WarpReduce
{
private:

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

/// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two))
typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || PowOfTwo<LOGICAL_WARP_THREADS>::VALUE),
WarpReduceShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>,
WarpReduceSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpReduce;

#endif // DOXYGEN_SHOULD_SKIP_THIS

public:

/// \smemstorage{WarpReduce}
typedef typename InternalWarpReduce::TempStorage TempStorage;

private:

/// Shared storage reference
TempStorage &temp_storage;

/// Warp ID
int warp_id;

/// Lane ID
int lane_id;

/// Internal storage allocator (a private static allocation of shared memory)
__device__ __forceinline__ TempStorage& PrivateStorage()
{
__shared__ TempStorage private_storage;
return private_storage;
}

public:
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ WarpReduce()
:
temp_storage(PrivateStorage()),
warp_id(threadIdx.x / LOGICAL_WARP_THREADS),
lane_id(threadIdx.x % LOGICAL_WARP_THREADS)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ WarpReduce(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage),
warp_id(threadIdx.x / LOGICAL_WARP_THREADS),
lane_id(threadIdx.x % LOGICAL_WARP_THREADS)
{}

/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Threads are identified using the given warp and lane identifiers.
*/
__device__ __forceinline__ WarpReduce(
int warp_id, ///< [in] A suitable warp membership identifier
int lane_id) ///< [in] A lane identifier within the warp
:
temp_storage(PrivateStorage()),
warp_id(warp_id),
lane_id(lane_id)
{}
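
// For example, a 128-thread block might be partitioned into eight 16-thread logical
// warps, constructing with explicit identifiers derived from threadIdx.x (a sketch;
// WarpReduce16 is an illustrative name):
//
//     typedef cub::WarpReduce<int, 8, 16> WarpReduce16;
//     int aggregate = WarpReduce16(threadIdx.x / 16, threadIdx.x % 16).Sum(thread_data);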
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Threads are identified using the given warp and lane identifiers.
*/
__device__ __forceinline__ WarpReduce(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int warp_id, ///< [in] A suitable warp membership identifier
int lane_id) ///< [in] A lane identifier within the warp
:
temp_storage(temp_storage),
warp_id(warp_id),
lane_id(lane_id)
{}

/**
* \brief Computes a warp-wide sum in each active warp. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* \smemreuse
*/
__device__ __forceinline__ T Sum(
T input) ///< [in] Calling thread's input
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<true>(input, LOGICAL_WARP_THREADS);
}
/**
* \brief Computes a segmented sum in each active warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* \smemreuse
*
* The code snippet below illustrates a head-segmented warp sum
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize WarpReduce for a single warp on type int
*     typedef cub::WarpReduce<int> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item and segment head flag per thread
*     int thread_data = ...
*     int head_flag = ...
*
*     // Return the segment-wide sums to each segment head
*     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(thread_data, head_flag);
* }
* \endcode
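* \par
* Suppose the set of input \p thread_data and \p head_flag across the warp of threads is
* <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{1, 0, 0, 0, 1, 0, 0, 0, ...}</tt>, respectively.
* The corresponding output \p aggregate in threads 0, 4, 8, etc. will be \p 6, \p 22, \p 38, etc.
* (and is undefined in other threads).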
*/
template <typename Flag>
__device__ __forceinline__ T HeadSegmentedSum(
T input, ///< [in] Calling thread's input
Flag head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment
{
return HeadSegmentedReduce(input, head_flag, cub::Sum());
}

/**
* \brief Computes a segmented sum in each active warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* \smemreuse
*
* The code snippet below illustrates a tail-segmented warp sum
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize WarpReduce for a single warp on type int
*     typedef cub::WarpReduce<int> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item and segment tail flag per thread
*     int thread_data = ...
*     int tail_flag = ...
*
*     // Return the segment-wide sums to each segment head
*     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(thread_data, tail_flag);
* }
* \endcode
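* \par
* Suppose the set of input \p thread_data and \p tail_flag across the warp of threads is
* <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{0, 0, 0, 1, 0, 0, 0, 1, ...}</tt>, respectively.
* The corresponding output \p aggregate in threads 0, 4, 8, etc. will be \p 6, \p 22, \p 38, etc.
* (and is undefined in other threads).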
*/
template <typename Flag>
__device__ __forceinline__ T TailSegmentedSum(
T input, ///< [in] Calling thread's input
Flag tail_flag) ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
{
return TailSegmentedReduce(input, tail_flag, cub::Sum());
}

/**
* \brief Computes a warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates four concurrent warp max reductions within a block of
* 128 threads (one for each of the four 32-thread warps).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize WarpReduce for 4 warps on type int
*     typedef cub::WarpReduce<int, 4> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item per thread
*     int thread_data = ...
*
*     // Return the warp-wide max reductions to each lane0
*     int aggregate = WarpReduce(temp_storage).Reduce(thread_data, cub::Max());
* }
* \endcode
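* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63, \p 95,
* and \p 127, respectively (and is undefined in other threads).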
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<true>(input, LOGICAL_WARP_THREADS, reduction_op);
}

/**
* \brief Computes a partially-full warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined.
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates a max reduction within a single, partially-full
* block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items)
* {
*     // Specialize WarpReduce for a single warp on type int
*     typedef cub::WarpReduce<int> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item per thread if in range
*     int thread_data;
*     if (threadIdx.x < valid_items)
*         thread_data = d_data[threadIdx.x];
*
*     // Return the warp-wide max reduction of valid items to lane0
*     int aggregate = WarpReduce(temp_storage).Reduce(thread_data, cub::Max(), valid_items);
* }
* \endcode
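* \par
* Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...}</tt> and \p valid_items is \p 4.
* The corresponding output \p aggregate in <em>lane</em><sub>0</sub> will be \p 3 (and is
* undefined in other threads).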
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op, ///< [in] Binary reduction operator
int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<false>(input, valid_items, reduction_op);
}

/**
* \brief Computes a segmented reduction in each active warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates a head-segmented warp max
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize WarpReduce for a single warp on type int
*     typedef cub::WarpReduce<int> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item and segment head flag per thread
*     int thread_data = ...
*     int head_flag = ...
*
*     // Return the segment-wide max reductions to each segment head
*     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(thread_data, head_flag, cub::Max());
* }
* \endcode
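* \par
* Suppose the set of input \p thread_data and \p head_flag across the warp of threads is
* <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{1, 0, 0, 0, 1, 0, 0, 0, ...}</tt>, respectively.
* The corresponding output \p aggregate in threads 0, 4, 8, etc. will be \p 3, \p 7, \p 11, etc.
* (and is undefined in other threads).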
*/
template <typename ReductionOp, typename Flag>
__device__ __forceinline__ T HeadSegmentedReduce(
T input, ///< [in] Calling thread's input
Flag head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<true>(input, head_flag, reduction_op);
}

/**
* \brief Computes a segmented reduction in each active warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates a tail-segmented warp max
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize WarpReduce for a single warp on type int
*     typedef cub::WarpReduce<int> WarpReduce;
*
*     // Allocate shared memory for WarpReduce
*     __shared__ typename WarpReduce::TempStorage temp_storage;
*
*     // Obtain one input item and segment tail flag per thread
*     int thread_data = ...
*     int tail_flag = ...
*
*     // Return the segment-wide max reductions to each segment head
*     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(thread_data, tail_flag, cub::Max());
* }
* \endcode
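* \par
* Suppose the set of input \p thread_data and \p tail_flag across the warp of threads is
* <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{0, 0, 0, 1, 0, 0, 0, 1, ...}</tt>, respectively.
* The corresponding output \p aggregate in threads 0, 4, 8, etc. will be \p 3, \p 7, \p 11, etc.
* (and is undefined in other threads).
*/
template <typename ReductionOp, typename Flag>
__device__ __forceinline__ T TailSegmentedReduce(
T input, ///< [in] Calling thread's input
Flag tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<false>(input, tail_flag, reduction_op);
}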