Page MenuHomec4science

vector_kernel.cu
No OneTemporary

File Metadata

Created
Mon, Oct 21, 21:04

vector_kernel.cu

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* vector_kernel.cu
*
* @brief CUDA kernel methods for basic operations on vectors.
*
* CUDA kernel methods for basic operations on vectors.
*
* Examples:
* - vectorAddConstant(): d_vector + constant
* - vectorAddUniform(): d_vector + uniform (per-block constants)
* - vectorAddVectorVector(): d_vector + d_vector
*/
// MJH: these functions assume there are 2N elements for N threads.
// Is this always going to be a good idea? There may be cases where
// we have as many threads as elements, but for large problems
// we are probably limited by max CTA size for simple kernels like
// this so we should process multiple elements per thread.
// we may want to extend these with looping versions that process
// many elements per thread.
#include "cudpp_util.h"
#include "sharedmem.h"
#include "cudpp.h"
/** \addtogroup cudpp_kernel
* @{
*/
/** @name Vector Functions
* CUDA kernel methods for basic operations on vectors.
* @{
*/
/** @brief Adds a constant value to all values in the input d_vector
*
* Each thread adds two pairs of elements.
* @todo Test this function -- it is currently not yet used.
*
* @param[in,out] d_vector The array of elements to be modified
* @param[in] constant The constant value to be added to elements of
* \a d_vector
* @param[in] n The number of elements in the d_vector to be modified
* @param[in] baseIndex An optional offset to the beginning of the
* elements in the input array to be processed
*/
template <class T>
__global__ void vectorAddConstant(T *d_vector,
T constant,
int n,
int baseIndex)
{
// Compute this thread's output address
unsigned int address = baseIndex + threadIdx.x +
__mul24(blockIdx.x, (blockDim.x << 1));
// note two adds per thread: one in first half of the block, one in last
d_vector[address] += constant;
d_vector[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * constant;
}
/** @brief Add a uniform value to each data element of an array
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to all values "owned" by the CTA in \a
* d_vector. Each thread adds two pairs of values.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T>
__global__ void vectorAddUniform(T *d_vector,
const T *d_uniforms,
int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset];
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1));
__syncthreads();
// note two adds per thread: one in first half of the block, one in last
d_vector[address] += uni;
if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni;
}
/** @brief Add a uniform value to each data element of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to all values "owned" by the CTA in \a d_vector.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the
* rows of \a d_vector.
* @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the
* rows of \a d_uniforms.
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T, CUDPPOperator op, int elementsPerThread>
__global__ void vectorAddUniform4(T *d_vector,
const T *d_uniforms,
int numElements,
int vectorRowPitch, // width of input array in elements
int uniformRowPitch, // width of uniform array in elements
int blockOffset,
int baseIndex)
{
__shared__ T uni;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset];
}
// Compute this thread's output address
//int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y)
+ threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread));
numElements += __umul24(vectorRowPitch, blockIdx.y);
__syncthreads();
switch (op)
{
case CUDPP_ADD:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] += uni;
address += blockDim.x;
}
break;
case CUDPP_MULTIPLY:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] *= uni;
address += blockDim.x;
}
break;
case CUDPP_MAX:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] = max(d_vector[address], uni);
address += blockDim.x;
}
break;
case CUDPP_MIN:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] = min(d_vector[address], uni);
address += blockDim.x;
}
break;
default:
break;
}
}
/** @brief Adds together two vectors
*
* Each thread adds two pairs of elements.
* @todo Test this function -- it is currently not yet used.
*
* @param[out] d_vectorA The left operand array and the result
* @param[in] d_vectorB The right operand array
* @param[in] numElements The number of elements in the vectors to be added.
* @param[in] baseIndex An optional offset to the beginning of the
* elements in the input arrays to be processed
*/
template <class T>
__global__ void vectorAddVector(T *d_vectorA, // A += B
const T *d_vectorB,
int numElements,
int baseIndex)
{
// Compute this thread's output address
unsigned int address = baseIndex + threadIdx.x +
__mul24(blockIdx.x, (blockDim.x << 1));
// note two adds per thread: one in first half of the block, one in last
d_vectorA[address] += d_vectorB[address];
d_vectorA[address + blockDim.x] +=
(threadIdx.x + blockDim.x < numElements) * d_vectorB[address];
}
/** @brief Add a uniform value to data elements of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to values "owned" by the CTA in \a d_vector.
* The uniform value is added to only those values "owned" by the CTA which
* have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX
* it adds the uniform to all values "owned" by the CTA.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] d_maxIndices The array of maximum indices (one per CTA). This is
* index upto which the uniform would be added. If this is UINT_MAX
* the uniform is added to all elements of the CTA. This index is
* 1-based.
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4(T *d_vector,
const T *d_uniforms,
const unsigned int *d_maxIndices,
unsigned int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni[2];
unsigned int blockAddress =
blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
if (blockAddress > 0)
uni[0] = d_uniforms[blockAddress-1];
else
uni[0] = Operator<T, oper>::identity();
// Tacit assumption that T is four-byte wide
uni[1] = (T)(d_maxIndices[blockAddress]);
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));
__syncthreads();
unsigned int maxIndex = (unsigned int)(uni[1]);
bool isLastBlock = (blockIdx.x == (gridDim.x-1));
if (maxIndex < UINT_MAX)
{
// Since maxIndex is a 1 based index
--maxIndex;
bool leftLess = address < maxIndex;
bool rightLess = (address + 7 * blockDim.x) < maxIndex;
if (leftLess)
{
if (rightLess)
{
for (unsigned int i = 0; i < 8; ++i)
d_vector[address + i * blockDim.x] =
Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
}
else
{
for (unsigned int i=0; i < 8; ++i)
{
if (address < maxIndex)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
else
{
if (!isLastBlockFull && isLastBlock)
{
for (unsigned int i = 0; i < 8; ++i)
{
if (address < numElements)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
else
{
for (unsigned int i=0; i<8; ++i)
{
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
/** @brief Add a uniform value to data elements of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to values "owned" by the CTA in \a d_vector.
* The uniform value is added to only those values "owned" by the CTA which
* have an index greater than d_minIndex. If d_minIndex for that CTA is 0
* it adds the uniform to all values "owned" by the CTA.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] d_minIndices The array of minimum indices (one per CTA). The
* uniform is added to the right of this index (that is, to every index
* that is greater than this index). If this is 0, the uniform is
* added to all elements of the CTA. This index is 1-based to
* prevent overloading of what 0 means. In our case it means
* absence of a flag. But if the first element of a CTA has
* flag the index will also be 0. Hence we use 1-based indices
* so the index is 1 in the latter case.
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*
*/
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4(T *d_vector,
const T *d_uniforms,
const unsigned int *d_minIndices,
unsigned int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni[2];
unsigned int blockAddress =
blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
// FIXME - blockAddress test here is incompatible with how it is calculated
// above
if (blockAddress < (gridDim.x-1))
uni[0] = d_uniforms[blockAddress+1];
else
uni[0] = Operator<T, oper>::identity();
// Tacit assumption that T is four-byte wide
uni[1] = (T)(d_minIndices[blockAddress]);
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));
__syncthreads();
unsigned int minIndex = (unsigned int)(uni[1]);
bool isLastBlock = (blockIdx.x == (gridDim.x-1));
if (minIndex > 0)
{
// Since minIndex is a 1 based index
--minIndex;
bool leftInRange = address > minIndex;
bool rightInRange = (address + 7 * blockDim.x) > minIndex;
if (rightInRange)
{
if (leftInRange)
{
for (unsigned int i = 0; i < 8; ++i)
d_vector[address + i * blockDim.x] =
Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
}
else
{
for (unsigned int i=0; i < 8; ++i)
{
if (address > minIndex)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
else
{
if (!isLastBlockFull && isLastBlock)
{
for (unsigned int i = 0; i < 8; ++i)
{
if (address < numElements)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
else
{
for (unsigned int i=0; i<8; ++i)
{
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
/** @} */ // end d_vector functions
/** @} */ // end cudpp_kernel

Event Timeline