if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples);
}
/**
 * Histogram pass kernel entry point (multi-block). Computes privatized histograms, one per thread block.
 */
template <
typename BlockHistogramTilesPolicy, ///< Tuning policy for cub::BlockHistogramTiles abstraction
int BINS, ///< Number of histogram bins per channel
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to <tt>unsigned char</tt>
typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
typename SizeT> ///< Integer type used for global array indexing
// NOTE(review): the kernel declarator line (presumably "__global__ void MultiBlockHistogramKernel(")
// appears to be missing between the template header above and the parameter list below —
// likely lost in extraction; confirm against the original source.
InputIteratorRA d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt>
SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels
GridEvenShare<SizeT> even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
GridQueue<SizeT> queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
/**
 * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from samples data residing within global memory. ![](histogram_logo.png)
* \ingroup DeviceModule
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
* counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
*
* \par Usage Considerations
* \cdp_class{DeviceHistogram}
*
* \par Performance
*
* \image html histo_perf.png
*
*/
struct DeviceHistogram
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
 * Internal dispatch routine for invoking device-wide, multi-channel histogram
 */
// NOTE(review): only a fragment of the original dispatch logic is visible in this
// chunk (device/SM queries and the final optional stream sync); the occupancy
// estimation, temp-storage layout, grid sizing, and the init / multi-block /
// aggregate kernel launches appear to have been truncated — confirm against the
// original source before relying on this routine.
template <
    int                             BINS,                           ///< Number of histogram bins per channel
    int                             CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
    int                             ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
    typename                        InitHistoKernelPtr,             ///< Function type of cub::InitHistoKernel
    typename                        MultiBlockHistogramKernelPtr,   ///< Function type of cub::MultiBlockHistogramKernel
    typename                        AggregateHistoKernelPtr,        ///< Function type of cub::AggregateHistoKernel
    typename                        InputIteratorRA,                ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to <tt>unsigned char</tt>
    typename                        HistoCounter,                   ///< Integral type for counting sample occurrences per histogram bin
    typename                        SizeT>                          ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
    void                            *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
    size_t                          &temp_storage_bytes,            ///< [in,out] Size in bytes of \p d_temp_storage allocation.
    InitHistoKernelPtr              init_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::InitHistoKernel
    MultiBlockHistogramKernelPtr    multi_block_kernel,             ///< [in] Kernel function pointer to parameterization of cub::MultiBlockHistogramKernel
    AggregateHistoKernelPtr         aggregate_kernel,               ///< [in] Kernel function pointer to parameterization of cub::AggregateHistoKernel
    KernelDispachParams             &multi_block_dispatch_params,   ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for
    InputIteratorRA                 d_samples,                      ///< [in] Input samples to histogram
    HistoCounter                    *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
    SizeT                           num_samples,                    ///< [in] Number of samples to process
    cudaStream_t                    stream = 0,                     ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                            stream_synchronous = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED

    // Kernel launch not supported from this device
    return CubDebug(cudaErrorNotSupported);

#else

    cudaError error = cudaSuccess;
    do
    {
        // Get device ordinal
        int device_ordinal;
        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

        // Get SM count
        int sm_count;
        if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

        // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture

        // Optionally sync the stream to surface any asynchronous launch/execution errors
        if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
    }
    while (0);
    // FIX: removed a stray extra '}' that previously closed the function before
    // 'while (0);', leaving the do/while malformed and 'return error;' outside
    // the function body.

    return error;

#endif // CUB_RUNTIME_ENABLED
}
/**
* \brief Computes a device-wide histogram
*
 * \tparam GRID_ALGORITHM cub::BlockHistogramTilesAlgorithm enumerator specifying the underlying algorithm to use
 * \tparam BINS Number of histogram bins per channel
 * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
* \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that is assignable to <tt>unsigned char</tt>
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
template <
BlockHistogramTilesAlgorithm GRID_ALGORITHM,
int BINS, ///< Number of histogram bins per channel
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to <tt>unsigned char</tt>
typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_samples, ///< [in] Input samples to histogram
HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Number of samples to process
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Type used for array indexing
typedef int SizeT;
// Tuning policies for the PTX architecture that will get dispatched to
// NOTE(review): the remainder of this function body (policy selection and the
// call through to the internal Dispatch overload) appears to be truncated in
// this chunk — confirm against the original source.
/**
 * \brief Computes a device-wide histogram. Uses fast block-sorting to compute the histogram. Delivers consistent throughput regardless of sample diversity, but occupancy may be limited by histogram bin count.
*
* However, because histograms are privatized in shared memory, a large
* number of bins (e.g., thousands) may adversely affect occupancy and
* performance (or even the ability to launch).
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the computation of a 256-bin histogram of
* single-channel <tt>unsigned char</tt> samples.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input samples and 256-bin output histogram
* unsigned char *d_samples;
* unsigned int *d_histogram;
* int num_items = ...
* ...
*
* // Wrap d_samples device pointer in a random-access texture iterator
* \tparam BINS Number of histogram bins per channel
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1]
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
// NOTE(review): the InputIteratorRA d_samples input parameter and the entire
// function body appear to be missing from this chunk (likely lost in
// extraction) — confirm against the original source before relying on this
// signature.
template <
int BINS,
typename InputIteratorRA,
typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t SingleChannelSorting(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Number of samples to process
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
/**
 * \brief Computes a device-wide histogram. Uses shared-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded, and occupancy may be limited by histogram bin count.
*
* However, because histograms are privatized in shared memory, a large
* number of bins (e.g., thousands) may adversely affect occupancy and
* performance (or even the ability to launch).
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the computation of a 256-bin histogram of
* single-channel <tt>unsigned char</tt> samples.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input samples and 256-bin output histogram
* unsigned char *d_samples;
* unsigned int *d_histogram;
* int num_items = ...
* ...
*
* // Wrap d_samples device pointer in a random-access texture iterator
* \tparam BINS Number of histogram bins per channel
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1]
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
// NOTE(review): the InputIteratorRA d_samples input parameter and the entire
// function body appear to be missing from this chunk (likely lost in
// extraction) — confirm against the original source before relying on this
// signature.
template <
int BINS,
typename InputIteratorRA,
typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t SingleChannelSharedAtomic(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Number of samples to process
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
/**
 * \brief Computes a device-wide histogram. Uses global-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded.
*
* Performance is not significantly impacted when computing histograms having large
* numbers of bins (e.g., thousands).
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the computation of a 256-bin histogram of
* single-channel <tt>unsigned char</tt> samples.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input samples and 256-bin output histogram
* unsigned char *d_samples;
* unsigned int *d_histogram;
* int num_items = ...
* ...
*
* // Wrap d_samples device pointer in a random-access texture iterator
* \tparam BINS Number of histogram bins per channel
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1]
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
// NOTE(review): the InputIteratorRA d_samples input parameter and the entire
// function body appear to be missing from this chunk (likely lost in
// extraction) — confirm against the original source before relying on this
// signature.
template <
int BINS,
typename InputIteratorRA,
typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t SingleChannelGlobalAtomic(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Number of samples to process
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
/**
 * \brief Computes a device-wide histogram from multi-channel data. Uses fast block-sorting to compute the histogram. Delivers consistent throughput regardless of sample diversity, but occupancy may be limited by histogram bin count.
*
* However, because histograms are privatized in shared memory, a large
* number of bins (e.g., thousands) may adversely affect occupancy and
* performance (or even the ability to launch).
*
* The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the computation of three 256-bin histograms from
* \tparam BINS Number of histogram bins per channel
* \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
* \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1]
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
// NOTE(review): the function body appears to be missing from this chunk
// (likely lost in extraction) — confirm against the original source before
// relying on this signature.
template <
int BINS,
int CHANNELS,
int ACTIVE_CHANNELS,
typename InputIteratorRA,
typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t MultiChannelSorting(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
/**
 * \brief Computes a device-wide histogram from multi-channel data. Uses shared-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded, and occupancy may be limited by histogram bin count.
*
* However, because histograms are privatized in shared memory, a large
* number of bins (e.g., thousands) may adversely affect occupancy and
* performance (or even the ability to launch).
*
* The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the computation of three 256-bin histograms from
* \tparam BINS Number of histogram bins per channel
* \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
* \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1]
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
// NOTE(review): the function body appears to be missing from this chunk
// (likely lost in extraction) — confirm against the original source before
// relying on this signature.
template <
int BINS,
int CHANNELS,
int ACTIVE_CHANNELS,
typename InputIteratorRA,
typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t MultiChannelSharedAtomic(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
/**
 * \brief Computes a device-wide histogram from multi-channel data. Uses global-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded.
*
* Performance is not significantly impacted when computing histograms having large
* numbers of bins (e.g., thousands).
*
* The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* Performance is often improved when referencing input samples through a texture-caching iterator, e.g., cub::TexIteratorRA or cub::TexTransformIteratorRA.
*
* \par
* The code snippet below illustrates the computation of three 256-bin histograms from
* \tparam BINS Number of histogram bins per channel
* \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
* \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1]
* \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin
*/
template <
int BINS,
int CHANNELS,
int ACTIVE_CHANNELS,
typename InputIteratorRA,
typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t MultiChannelGlobalAtomic(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter.
int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.