block_scan_tiles.cuh
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Jul 28, 21:42

block_scan_tiles.cuh
View Options

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* cub::BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
	*/

	#pragma once

	#include <iterator>

	#include "scan_tiles_types.cuh"
	#include "../../block/block_load.cuh"
	#include "../../block/block_store.cuh"
	#include "../../block/block_scan.cuh"
	#include "../../grid/grid_queue.cuh"
	#include "../../util_namespace.cuh"

	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {


	/******************************************************************************
	* Tuning policy types
	******************************************************************************/

	/**
	* Tuning policy for BlockScanTiles
	*/
	template <
	int _BLOCK_THREADS,
	int _ITEMS_PER_THREAD,
	BlockLoadAlgorithm _LOAD_ALGORITHM,
	bool _LOAD_WARP_TIME_SLICING,
	PtxLoadModifier _LOAD_MODIFIER,
	BlockStoreAlgorithm _STORE_ALGORITHM,
	bool _STORE_WARP_TIME_SLICING,
	BlockScanAlgorithm _SCAN_ALGORITHM>
	struct BlockScanTilesPolicy
	{
	enum
	{
	BLOCK_THREADS = _BLOCK_THREADS,
	ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
	LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING,
	STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,
	};

	static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
	static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
	static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
	static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
	};


	/******************************************************************************
	* Thread block abstractions
	******************************************************************************/

	/**
	* \brief BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
	*
	* Implements a single-pass "domino" strategy with adaptive prefix lookback.
	*/
	template <
	typename BlockScanTilesPolicy, ///< Tuning policy
	typename InputIteratorRA, ///< Input iterator type
	typename OutputIteratorRA, ///< Output iterator type
	typename ScanOp, ///< Scan functor type
	typename Identity, ///< Identity element type (cub::NullType for inclusive scan)
	typename SizeT> ///< Offset integer type
	struct BlockScanTiles
	{
	//---------------------------------------------------------------------
	// Types and constants
	//---------------------------------------------------------------------

	// Data type of input iterator
	typedef typename std::iterator_traits<InputIteratorRA>::value_type T;

	// Constants
	enum
	{
	INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided
	BLOCK_THREADS = BlockScanTilesPolicy::BLOCK_THREADS,
	ITEMS_PER_THREAD = BlockScanTilesPolicy::ITEMS_PER_THREAD,
	TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
	};

	// Block load type
	typedef BlockLoad<
	InputIteratorRA,
	BlockScanTilesPolicy::BLOCK_THREADS,
	BlockScanTilesPolicy::ITEMS_PER_THREAD,
	BlockScanTilesPolicy::LOAD_ALGORITHM,
	BlockScanTilesPolicy::LOAD_MODIFIER,
	BlockScanTilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadT;

	// Block store type
	typedef BlockStore<
	OutputIteratorRA,
	BlockScanTilesPolicy::BLOCK_THREADS,
	BlockScanTilesPolicy::ITEMS_PER_THREAD,
	BlockScanTilesPolicy::STORE_ALGORITHM,
	STORE_DEFAULT,
	BlockScanTilesPolicy::STORE_WARP_TIME_SLICING> BlockStoreT;

	// Tile status descriptor type
	typedef ScanTileDescriptor<T> ScanTileDescriptorT;

	// Block scan type
	typedef BlockScan<
	T,
	BlockScanTilesPolicy::BLOCK_THREADS,
	BlockScanTilesPolicy::SCAN_ALGORITHM> BlockScanT;

	// Callback type for obtaining inter-tile prefix during block scan
	typedef DeviceScanBlockPrefixOp<T, ScanOp> InterblockPrefixOp;

	// Shared memory type for this threadblock
	struct _TempStorage
	{
	union
	{
	typename BlockLoadT::TempStorage load; // Smem needed for tile loading
	typename BlockStoreT::TempStorage store; // Smem needed for tile storing
	struct
	{
	typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback
	typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
	};
	};

	SizeT tile_idx; // Shared tile index
	};

	// Alias wrapper allowing storage to be unioned
	struct TempStorage : Uninitialized<_TempStorage> {};


	//---------------------------------------------------------------------
	// Per-thread fields
	//---------------------------------------------------------------------

	_TempStorage &temp_storage; ///< Reference to temp_storage
	InputIteratorRA d_in; ///< Input data
	OutputIteratorRA d_out; ///< Output data
	ScanOp scan_op; ///< Binary scan operator
	Identity identity; ///< Identity element



	//---------------------------------------------------------------------
	// Block scan utility methods (first tile)
	//---------------------------------------------------------------------

	/**
	* Exclusive scan specialization
	*/
	template <typename _ScanOp, typename _Identity>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
	{
	BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
	}

	/**
	* Exclusive sum specialization
	*/
	template <typename _Identity>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
	{
	BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
	}

	/**
	* Inclusive scan specialization
	*/
	template <typename _ScanOp>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
	{
	BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
	}

	/**
	* Inclusive sum specialization
	*/
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
	{
	BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
	}

	//---------------------------------------------------------------------
	// Block scan utility methods (subsequent tiles)
	//---------------------------------------------------------------------

	/**
	* Exclusive scan specialization (with prefix from predecessors)
	*/
	template <typename _ScanOp, typename _Identity, typename PrefixCallback>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
	{
	BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
	}

	/**
	* Exclusive sum specialization (with prefix from predecessors)
	*/
	template <typename _Identity, typename PrefixCallback>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
	{
	BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
	}

	/**
	* Inclusive scan specialization (with prefix from predecessors)
	*/
	template <typename _ScanOp, typename PrefixCallback>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
	{
	BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
	}

	/**
	* Inclusive sum specialization (with prefix from predecessors)
	*/
	template <typename PrefixCallback>
	__device__ __forceinline__
	void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
	{
	BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
	}

	//---------------------------------------------------------------------
	// Constructor
	//---------------------------------------------------------------------

	// Constructor
	__device__ __forceinline__
	BlockScanTiles(
	TempStorage &temp_storage, ///< Reference to temp_storage
	InputIteratorRA d_in, ///< Input data
	OutputIteratorRA d_out, ///< Output data
	ScanOp scan_op, ///< Binary scan operator
	Identity identity) ///< Identity element
	:
	temp_storage(temp_storage.Alias()),
	d_in(d_in),
	d_out(d_out),
	scan_op(scan_op),
	identity(identity)
	{}


	//---------------------------------------------------------------------
	// Domino scan
	//---------------------------------------------------------------------

	/**
	* Process a tile of input (domino scan)
	*/
	template <bool FULL_TILE>
	__device__ __forceinline__ void ConsumeTile(
	SizeT num_items, ///< Total number of input items
	int tile_idx, ///< Tile index
	SizeT block_offset, ///< Tile offset
	ScanTileDescriptorT *d_tile_status) ///< Global list of tile status
	{
	// Load items
	T items[ITEMS_PER_THREAD];

	if (FULL_TILE)
	BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
	else
	BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_items - block_offset);

	__syncthreads();

	T block_aggregate;
	if (tile_idx == 0)
	{
	ScanBlock(items, scan_op, identity, block_aggregate);

	// Update tile status if there are successor tiles
	if (FULL_TILE && (threadIdx.x == 0))
	ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
	}
	else
	{
	InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx);
	ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
	}

	__syncthreads();

	// Store items
	if (FULL_TILE)
	BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
	else
	BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_items - block_offset);
	}

	/**
	* Dequeue and scan tiles of items as part of a domino scan
	*/
	__device__ __forceinline__ void ConsumeTiles(
	int num_items, ///< Total number of input items
	GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
	ScanTileDescriptorT *d_tile_status) ///< Global list of tile status
	{
	#if CUB_PTX_ARCH < 200

	// No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks)
	int tile_idx = blockIdx.x;
	SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;

	if (block_offset + TILE_ITEMS <= num_items)
	ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status);
	else if (block_offset < num_items)
	ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status);

	#else

	// Get first tile
	if (threadIdx.x == 0)
	temp_storage.tile_idx = queue.Drain(1);

	__syncthreads();

	int tile_idx = temp_storage.tile_idx;
	SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;

	while (block_offset + TILE_ITEMS <= num_items)
	{
	// Consume full tile
	ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status);

	// Get next tile
	if (threadIdx.x == 0)
	temp_storage.tile_idx = queue.Drain(1);

	__syncthreads();

	tile_idx = temp_storage.tile_idx;
	block_offset = SizeT(TILE_ITEMS) * tile_idx;
	}

	// Consume a partially-full tile
	if (block_offset < num_items)
	{
	ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status);
	}
	#endif

	}


	//---------------------------------------------------------------------
	// Even-share scan
	//---------------------------------------------------------------------

	/**
	* Process a tile of input
	*/
	template <
	bool FULL_TILE,
	bool FIRST_TILE>
	__device__ __forceinline__ void ConsumeTile(
	SizeT block_offset, ///< Tile offset
	RunningBlockPrefixOp<T> &prefix_op, ///< Running prefix operator
	int valid_items = TILE_ITEMS) ///< Number of valid items in the tile
	{
	// Load items
	T items[ITEMS_PER_THREAD];

	if (FULL_TILE)
	BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
	else
	BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);

	__syncthreads();

	// Block scan
	T block_aggregate;
	if (FIRST_TILE)
	{
	ScanBlock(items, scan_op, identity, block_aggregate);
	prefix_op.running_total = block_aggregate;
	}
	else
	{
	ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
	}

	__syncthreads();

	// Store items
	if (FULL_TILE)
	BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
	else
	BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
	}


	/**
	* Scan a consecutive share of input tiles
	*/
	__device__ __forceinline__ void ConsumeTiles(
	SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
	SizeT block_oob) ///< [in] Threadblock end offset (exclusive)
	{
	RunningBlockPrefixOp<T> prefix_op;

	if (block_offset + TILE_ITEMS <= block_oob)
	{
	// Consume first tile of input (full)
	ConsumeTile<true, true>(block_offset, prefix_op);
	block_offset += TILE_ITEMS;

	// Consume subsequent full tiles of input
	while (block_offset + TILE_ITEMS <= block_oob)
	{
	ConsumeTile<true, false>(block_offset, prefix_op);
	block_offset += TILE_ITEMS;
	}

	// Consume a partially-full tile
	if (block_offset < block_oob)
	{
	int valid_items = block_oob - block_offset;
	ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
	}
	}
	else
	{
	// Consume the first tile of input (partially-full)
	int valid_items = block_oob - block_offset;
	ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
	}
	}


	/**
	* Scan a consecutive share of input tiles, seeded with the specified prefix value
	*/
	__device__ __forceinline__ void ConsumeTiles(
	SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
	SizeT block_oob, ///< [in] Threadblock end offset (exclusive)
	T prefix) ///< [in] The prefix to apply to the scan segment
	{
	RunningBlockPrefixOp<T> prefix_op;
	prefix_op.running_total = prefix;

	// Consume full tiles of input
	while (block_offset + TILE_ITEMS <= block_oob)
	{
	ConsumeTile<true, false>(block_offset, prefix_op);
	block_offset += TILE_ITEMS;
	}

	// Consume a partially-full tile
	if (block_offset < block_oob)
	{
	int valid_items = block_oob - block_offset;
	ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
	}
	}

	};


	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)

block_scan_tiles.cuhNo OneTemporaryActions

File Metadata

block_scan_tiles.cuhView Options

Event Timeline

block_scan_tiles.cuh
No OneTemporary
Actions

block_scan_tiles.cuh
View Options