* cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
*/
#pragma once
#include "../util_namespace.cuh"
#include "../util_macro.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
*
* \par Overview
* GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
* Threadblocks may receive one of three different amounts of work: "big", "normal",
* and "last". A "big" workload is one scheduling grain larger than a "normal" one. The "last"
* work unit, assigned to the last threadblock, may be only partially full if the number of
* input items is not an even multiple of the scheduling grain size.
*
* \par
* Before invoking a child grid, a parent thread will typically construct and initialize an instance of
* GridEvenShare using \p GridInit(). The instance can be passed to child threadblocks which can
* initialize their per-threadblock offsets using \p BlockInit().
*
* \tparam SizeT Integer type for array indexing
*/
template <typename SizeT>
class GridEvenShare
{
private:
SizeT total_grains;
int big_blocks;
SizeT big_share;
SizeT normal_share;
SizeT normal_base_offset;
public:
/// Total number of input items
SizeT num_items;
/// Grid size in threadblocks
int grid_size;
/// Offset into input marking the beginning of the owning thread block's segment of input tiles
SizeT block_offset;
/// Offset into input marking the end (one-past) of the owning thread block's segment of input tiles
SizeT block_oob;
/**
* \brief Block-based constructor for single-block grids.
int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threadblocks. Usually the thread block's native tile size (or a multiple thereof).