block_partition_tiles.cuh
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning.
*/
#pragma once
#include <iterator>
#include "scan_tiles_types.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../block/block_load.cuh"
#include "../../block/block_store.cuh"
#include "../../block/block_scan.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_vector.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Tuning policy for BlockPartitionTiles
*/
template <
int _PARTITIONS,
int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
PtxLoadModifier _LOAD_MODIFIER,
BlockScanAlgorithm _SCAN_ALGORITHM>
struct BlockPartitionTilesPolicy
{
enum
{
PARTITIONS = _PARTITIONS,
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
};
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
};
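/*
 * Example instantiation (a minimal sketch; the concrete tuning values and the
 * enumerator choices below are illustrative assumptions, not tuned defaults
 * shipped with this header):
 *
 *     // Two output partitions, 128 threads per block, 4 items per thread,
 *     // default cache loads, and a raking block scan
 *     typedef BlockPartitionTilesPolicy<2, 128, 4, LOAD_DEFAULT, BLOCK_SCAN_RAKING> ExamplePolicy;
 */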
/**
* Tuple type for scanning partition membership flags
*/
template <
typename SizeT,
int PARTITIONS>
struct PartitionScanTuple;
/**
* Tuple type for scanning partition membership flags (specialized for 1 output partition)
*/
template <typename SizeT>
struct PartitionScanTuple<SizeT, 1> : VectorHelper<SizeT, 1>::Type
{
__device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other)
{
PartitionScanTuple retval;
retval.x = this->x + other.x;
return retval;
}
template <typename PredicateOp, typename T>
__device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val)
{
this->x = pred_op(val);
}
template <typename PredicateOp, typename T, typename OutputIteratorRA>
__device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items)
{
if (pred_op(val))
d_out[this->x - 1] = val;
}
};
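/*
 * Worked example (illustrative): if pred_op yields flags {1, 0, 1, 1} for four
 * consecutive items, an inclusive scan of these tuples produces x = {1, 1, 2, 3}.
 * Each selected item is then scattered to d_out[x - 1], i.e. offsets {0, -, 1, 2},
 * compacting the selected items to the front of the output.
 */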
/**
* Tuple type for scanning partition membership flags (specialized for 2 output partitions)
*/
template <typename SizeT>
struct PartitionScanTuple<SizeT, 2> : VectorHelper<SizeT, 2>::Type
{
__device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other)
{
PartitionScanTuple retval;
retval.x = this->x + other.x;
retval.y = this->y + other.y;
return retval;
}
template <typename PredicateOp, typename T>
__device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val)
{
bool pred = pred_op(val);
this->x = pred;
this->y = !pred;
}
template <typename PredicateOp, typename T, typename OutputIteratorRA>
__device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items)
{
SizeT scatter_offset = (pred_op(val)) ?
this->x - 1 :
num_items - this->y;
d_out[scatter_offset] = val;
}
};
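/*
 * Worked example (illustrative): with num_items = 6 and predicate results
 * {1, 0, 1, 0, 0, 1}, an inclusive scan of these tuples produces
 * x = {1, 1, 2, 2, 2, 3} and y = {0, 1, 1, 2, 3, 3}. Selected items scatter to
 * d_out[x - 1] = {0, 1, 2} (front of the output), while rejected items scatter
 * to d_out[num_items - y] = {5, 4, 3} (back of the output, in reverse order),
 * splitting the output into two partitions.
 */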
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning.
*
* Implements a single-pass "domino" strategy with adaptive prefix lookback.
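*
* A minimal usage sketch (the kernel and identifiers below, e.g. PartitionKernel
* and Policy, are hypothetical and shown only to illustrate how this abstraction
* is typically driven; the GridQueue and tile-status descriptors are assumed to
* have been initialized on the host before launch):
*
* \code
* template <typename Policy, typename InputIteratorRA, typename OutputIteratorRA, typename PredicateOp, typename SizeT>
* __global__ void PartitionKernel(
*     InputIteratorRA                                                       d_in,
*     OutputIteratorRA                                                      d_out,
*     ScanTileDescriptor<PartitionScanTuple<SizeT, Policy::PARTITIONS> >    *d_tile_status,
*     PredicateOp                                                           pred_op,
*     SizeT                                                                 num_items,
*     SizeT                                                                 num_tiles,
*     GridQueue<int>                                                        queue)
* {
*     typedef BlockPartitionTiles<Policy, InputIteratorRA, OutputIteratorRA, PredicateOp, SizeT> BlockPartitionTilesT;
*
*     // Shared memory for the block-wide abstraction
*     __shared__ typename BlockPartitionTilesT::TempStorage temp_storage;
*
*     typename BlockPartitionTilesT::PartitionScanTuple partition_ends;
*     bool is_last_tile = false;
*
*     // Dequeue and consume tiles of input as part of the single-pass "domino" scan
*     BlockPartitionTilesT(temp_storage, d_in, d_out, d_tile_status, pred_op, num_items).ConsumeTiles(
*         queue, num_tiles, partition_ends, is_last_tile);
* }
* \endcode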
*/
template <
typename BlockPartitionTilesPolicy, ///< Tuning policy
typename InputIteratorRA, ///< Input iterator type
typename OutputIteratorRA, ///< Output iterator type
typename PredicateOp, ///< Partition predicate functor type
typename SizeT> ///< Offset integer type
struct BlockPartitionTiles
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
PARTITIONS = BlockPartitionTilesPolicy::PARTITIONS,
BLOCK_THREADS = BlockPartitionTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockPartitionTilesPolicy::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Load modifier
static const PtxLoadModifier LOAD_MODIFIER = BlockPartitionTilesPolicy::LOAD_MODIFIER;
// Data type of input iterator
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tuple type for scanning partition membership flags
typedef PartitionScanTuple<SizeT, PARTITIONS> PartitionScanTuple;
// Tile status descriptor type
typedef ScanTileDescriptor<PartitionScanTuple> ScanTileDescriptorT;
// Block scan type for scanning membership flag scan_tuples
typedef BlockScan<
PartitionScanTuple,
BlockPartitionTilesPolicy::BLOCK_THREADS,
BlockPartitionTilesPolicy::SCAN_ALGORITHM> BlockScanT;
// Callback type for obtaining inter-tile prefix during block scan
typedef DeviceScanBlockPrefixOp<PartitionScanTuple, Sum> InterblockPrefixOp;
// Shared memory type for this threadblock
struct TempStorage
{
typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback
typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
SizeT tile_idx; // Shared tile index
};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage; ///< Reference to temp_storage
InputIteratorRA d_in; ///< Input data
OutputIteratorRA d_out; ///< Output data
ScanTileDescriptorT *d_tile_status; ///< Global list of tile status
PredicateOp pred_op; ///< Unary predicate operator indicating membership in the first partition
SizeT num_items; ///< Total number of input items
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
__device__ __forceinline__
BlockPartitionTiles(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data
OutputIteratorRA d_out, ///< Output data
ScanTileDescriptorT *d_tile_status, ///< Global list of tile status
PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition
SizeT num_items) ///< Total number of input items
:
temp_storage(temp_storage),
d_in(d_in),
d_out(d_out),
d_tile_status(d_tile_status),
pred_op(pred_op),
num_items(num_items)
{}
//---------------------------------------------------------------------
// Domino scan
//---------------------------------------------------------------------
/**
* Process a tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
int tile_idx, ///< Tile index
SizeT block_offset, ///< Tile offset
PartitionScanTuple &partition_ends) ///< Running total
{
T items[ITEMS_PER_THREAD];
PartitionScanTuple scan_tuples[ITEMS_PER_THREAD];
// Load items
int valid_items = num_items - block_offset;
if (FULL_TILE)
LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
else
LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
// Prevent hoisting
// __syncthreads();
// __threadfence_block();
// Set partition membership flags in the scan tuples
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
scan_tuples[ITEM].SetFlags(pred_op, items[ITEM]);
}
// Perform inclusive scan over the scan tuples
PartitionScanTuple block_aggregate;
if (tile_idx == 0)
{
BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate);
partition_ends = block_aggregate;
// Update tile status if there are successor tiles
if (FULL_TILE && (threadIdx.x == 0))
ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
}
else
{
InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, Sum(), tile_idx);
BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate, prefix_op);
partition_ends = prefix_op.inclusive_prefix;
}
// Scatter items
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
// Scatter if not out-of-bounds
if (FULL_TILE || (threadIdx.x + (ITEM * BLOCK_THREADS) < valid_items))
{
scan_tuples[ITEM].Scatter(pred_op, items[ITEM], d_out, num_items);
}
}
}
/**
* Dequeue and scan tiles of items as part of a domino scan
*/
__device__ __forceinline__ void ConsumeTiles(
GridQueue<int> queue, ///< [in] Queue descriptor for assigning tiles of work to thread blocks
SizeT num_tiles, ///< [in] Total number of input tiles
PartitionScanTuple &partition_ends, ///< [out] Running partition end offsets
bool &is_last_tile) ///< [out] Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input)
{
#if CUB_PTX_ARCH < 200
// No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks)
int tile_idx = blockIdx.x;
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
if (block_offset + TILE_ITEMS <= num_items)
{
ConsumeTile<true>(tile_idx, block_offset, partition_ends);
}
else if (block_offset < num_items)
{
ConsumeTile<false>(tile_idx, block_offset, partition_ends);
}
is_last_tile = (tile_idx == num_tiles - 1);
#else
// Get first tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);
__syncthreads();
int tile_idx = temp_storage.tile_idx;
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
while (block_offset + TILE_ITEMS <= num_items)
{
// Consume full tile
ConsumeTile<true>(tile_idx, block_offset, partition_ends);
is_last_tile = (tile_idx == num_tiles - 1);
// Get next tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);
__syncthreads();
tile_idx = temp_storage.tile_idx;
block_offset = SizeT(TILE_ITEMS) * tile_idx;
}
// Consume a partially-full tile
if (block_offset < num_items)
{
ConsumeTile<false>(tile_idx, block_offset, partition_ends);
is_last_tile = (tile_idx == num_tiles - 1);
}
#endif
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
