#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Partition kernel entry point (multi-block)
*/
template <
typename BlockPartitionTilesPolicy, ///< Tuning policy for cub::BlockPartitionTiles abstraction
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt>
typename SizeT> ///< Integer type used for global array indexing
typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel
typename PartitionKernelPtr, ///< Function type of cub::PartitionKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
typename PredicateOp, ///< Unary predicate functor type indicating membership in the first partition. Must provide member <tt>bool operator()(const T &val)</tt>
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
int ptx_version, ///< [in] PTX version
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
PartitionKernelPtr partition_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionKernel
KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p partition_kernel was compiled for
InputIteratorRA d_in, ///< [in] Iterator pointing to input items
OutputIteratorRA d_out, ///< [in] Iterator pointing to output items
LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition
SizeT num_items, ///< [in] Total number of items to partition
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
* Internal partition dispatch routine for using default tuning policies
*/
template <
typename PARTITIONS, ///< Number of partitions we are keeping. NOTE(review): declared as a type parameter (\c typename) although documented as a count -- confirm the intended kind
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
typename PredicateOp, ///< Unary predicate functor type indicating membership in the first partition. Must provide member <tt>bool operator()(const T &val)</tt>
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to input items
OutputIteratorRA d_out, ///< [in] Iterator pointing to output items
LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition
SizeT num_items, ///< [in] Total number of items to partition
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
* \brief Splits a list of input items into two partitions within the given output list using the specified predicate. The relative ordering of inputs is not necessarily preserved.
*
* An item \p val is placed in the first partition if <tt>pred_op(val) == true</tt>, otherwise
* it is placed in the second partition. The offset of the partitioning pivot (equivalent to
* the total length of the first partition as well as the starting offset of the second), is
* recorded to \p d_partition_length.
*
* The length of the output referenced by \p d_out is assumed to be the same as that of \p d_in.
*
* \devicestorage
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam LengthOutputIterator <b>[inferred]</b> Output iterator type for recording the length of the first partition (may be a simple pointer type)
* \tparam PredicateOp <b>[inferred]</b> Unary predicate functor type indicating membership in the first partition. Must provide member <tt>bool operator()(const T &val)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename LengthOutputIterator,
typename PredicateOp>
__host__ __device__ __forceinline__
static cudaError_t Partition(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to input items
OutputIteratorRA d_out, ///< [in] Iterator pointing to output items
LengthOutputIterator d_pivot_offset, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition
int num_items, ///< [in] Total number of items to partition. NOTE(review): typed as \c int here, whereas the Dispatch routines take a templated \p SizeT -- confirm this is intentional
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.