Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86475584
cudpp_maximal_launch.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Oct 6, 17:29
Size
3 KB
Mime Type
text/x-c
Expires
Tue, Oct 8, 17:29 (2 d)
Engine
blob
Format
Raw Data
Handle
21402246
Attached To
rLAMMPS lammps
cudpp_maximal_launch.cpp
View Options
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
// Returns the smaller of two sizes; when they compare equal, x is returned.
inline size_t min(size_t x, size_t y)
{
    if (y < x)
    {
        return y;
    }
    return x;
}
// Returns the larger of two sizes; when they compare equal, x is returned.
inline size_t max(size_t x, size_t y)
{
    if (x < y)
    {
        return y;
    }
    return x;
}
// Computes how many units of size f are needed to cover x, i.e. x / f
// rounded up to a whole number. Note that the result is the unit COUNT,
// not x rounded up to a multiple of f.
// Precondition: f must be non-zero (the division is undefined otherwise).
inline size_t multiple(size_t x, size_t f)
{
    // Quotient plus a remainder correction gives the same result as
    // (x + (f-1)) / f but cannot wrap around when x is close to the
    // maximum value of size_t.
    return x / f + ((x % f != 0) ? 1 : 0);
}
// Rounds x up to the nearest multiple of f, in the manner of the
// spreadsheet CEIL() function.
inline size_t ceiling(size_t x, size_t f)
{
    // Number of f-sized units needed to cover x
    const size_t unitCount = multiple(x, f);
    // Scale the unit count back up to a size
    return f * unitCount;
}
// Determines the maximum number of CTAs (thread blocks) of a kernel that can
// run simultaneously on the whole device. This mirrors the calculation done
// in the CUDA Occupancy Calculator spreadsheet.
//
// attribs               - kernel attributes; numRegs and sharedSizeBytes are read
// devprop               - device properties; major, minor, warpSize, regsPerBlock,
//                         sharedMemPerBlock and multiProcessorCount are read
// bytesDynamicSharedMem - dynamic shared memory per block, added to the kernel's
//                         static shared memory size
// threadsPerBlock       - number of threads per block of the intended launch
//
// Returns devprop.multiProcessorCount times the smallest of the per-SM limits
// imposed by registers, shared memory, thread count and the fixed block cap.
// Returns 0 when threadsPerBlock is 0 or devprop.warpSize is not positive,
// since no block can be formed and the computation would divide by zero.
//
// NOTE(review): the allocation units and per-SM limits below are hard-coded
// constants; confirm they are appropriate for the devices being targeted.
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
                 cudaDeviceProp &devprop,
                 size_t bytesDynamicSharedMem,
                 size_t threadsPerBlock)
{
    // Guard against division by zero in the warp and thread-limit computations
    if (threadsPerBlock == 0 || devprop.warpSize <= 0)
    {
        return 0;
    }

    // True only for compute capability 1.0 and 1.1
    const bool isPreSm12 = (devprop.major < 2 && devprop.minor < 2);

    const unsigned int regAllocationUnit = isPreSm12 ? 256 : 512; // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512; // in bytes
    const unsigned int maxThreadsPerSM = isPreSm12 ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM = 8;

    // Number of warps (threads per block divided by warp size, rounded up)
    size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    // Static plus dynamic shared memory, rounded up to the allocation unit
    size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    // A resource the kernel does not use imposes no limit beyond the block cap
    size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;

    // The tightest per-SM limit, scaled by the number of multiprocessors
    return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}
extern "C"
size_t maxBlocksFromPointer(void* kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
cudaDeviceProp devprop;
int deviceID = -1;
cudaError_t err = cudaGetDevice(&deviceID);
if (err == cudaSuccess)
{
err = cudaGetDeviceProperties(&devprop, deviceID);
if (err != cudaSuccess)
return -1;
cudaFuncAttributes attr;
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
if (err != cudaSuccess)
return -1;
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
return -1;
}
Event Timeline
Log In to Comment