* The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
*/
#pragma once
#include "../util_arch.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
* \ingroup BlockModule
*
* \par Overview
* It is commonplace for blocks of threads to rearrange data items between
* threads. For example, the global memory subsystem prefers access patterns
* where data items are "striped" across threads (where consecutive threads access consecutive items),
* yet most block-wide operations prefer a "blocked" partitioning of items across threads
* (where consecutive items belong to a single thread).
*
* \par
* BlockExchange supports the following types of data exchanges:
* - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>striped</em>](index.html#sec5sec4) arrangements
* - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>warp-striped</em>](index.html#sec5sec4) arrangements
* - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec4)
* - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec4)
*
* \tparam T The data type to be exchanged.
* \tparam BLOCK_THREADS The thread block size in threads.
* \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread.
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
*
* \par A Simple Example
* \blockcollective{BlockExchange}
* \par
* The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for 128 threads owning 4 integer items each
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockExchange()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockExchange(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockExchange(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockExchange(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)