File Metadata

Created: Thu, Jul 17, 20:11

cudpp_maximal_launch.cpp
View Options

	// -------------------------------------------------------------
	// cuDPP -- CUDA Data Parallel Primitives library
	// -------------------------------------------------------------
	// $Revision$
	// $Date$
	// -------------------------------------------------------------
	// This source code is distributed under the terms of license.txt
	// in the root directory of this source distribution.
	// -------------------------------------------------------------
	#include "cudpp_maximal_launch.h"

	// Returns the smaller of the two values; x is returned when they are equal.
	inline size_t min(size_t x, size_t y)
	{
	    if (y < x)
	    {
	        return y;
	    }
	    return x;
	}

	// Returns the larger of the two values; x is returned when they are equal.
	inline size_t max(size_t x, size_t y)
	{
	    if (y > x)
	    {
	        return y;
	    }
	    return x;
	}

	// Computes how many multiples of f are needed to cover x, i.e. x / f
	// rounded up to the next whole number (ceiling division).
	// Note: this returns the COUNT of multiples, not the rounded-up value
	// itself.
	//
	// The result is formed from the quotient plus a remainder test rather than
	// (x + (f-1)) / f, so the intermediate value cannot wrap around when x is
	// close to the largest size_t.
	//
	// f must be non-zero (division by zero otherwise).
	inline size_t multiple(size_t x, size_t f)
	{
	    const size_t wholeUnits = x / f;
	    const size_t partialUnit = (x % f != 0) ? 1 : 0;
	    return wholeUnits + partialUnit;
	}


	// Spreadsheet-style CEILING(): rounds x up to the nearest multiple of f.
	// The number of f-sized units covering x is obtained from multiple(),
	// then scaled back up by f.
	inline size_t ceiling(size_t x, size_t f)
	{
	    const size_t unitCount = multiple(x, f);
	    return f * unitCount;
	}

	// Estimates the maximum number of thread blocks (CTAs) of a kernel that can
	// run simultaneously on the device, following the calculation done in the
	// CUDA Occupancy Calculator spreadsheet.
	//
	// Parameters:
	//   attribs               - kernel attributes; numRegs and sharedSizeBytes
	//                           are read.
	//   devprop               - device properties; major, minor, warpSize,
	//                           regsPerBlock, sharedMemPerBlock and
	//                           multiProcessorCount are read.
	//   bytesDynamicSharedMem - dynamic shared memory requested per block, in
	//                           bytes (added to the kernel's static usage).
	//   threadsPerBlock       - number of threads per block for the launch.
	//
	// Returns devprop.multiProcessorCount times the smallest of the register,
	// shared-memory, thread-count and hard (8) per-multiprocessor block limits.
	// Returns 0 when threadsPerBlock is 0 or devprop.warpSize is not positive,
	// since no block can be formed and the divisions below would otherwise
	// divide by zero.
	//
	// NOTE(review): the allocation units and per-SM limits below are hard-coded
	// for the device generations this file was written for -- confirm before
	// relying on the result for newer hardware.
	extern "C"
	size_t maxBlocks(cudaFuncAttributes &attribs,
	                 cudaDeviceProp &devprop,
	                 size_t bytesDynamicSharedMem,
	                 size_t threadsPerBlock)
	{
	    // Guard the divisors used below (threadsPerBlock and warpSize).
	    if (threadsPerBlock == 0 || devprop.warpSize <= 0)
	    {
	        return 0;
	    }

	    // True for devices below compute capability 1.2 as tested by the
	    // original condition (major < 2 and minor < 2).
	    const bool beforeSm12 = (devprop.major < 2 && devprop.minor < 2);

	    const unsigned int regAllocationUnit      = beforeSm12 ? 256 : 512;  // in registers
	    const unsigned int warpAllocationMultiple = 2;
	    const unsigned int smemAllocationUnit     = 512;                     // in bytes
	    // sm_12 GPUs increase threads/SM to 1024
	    const unsigned int maxThreadsPerSM        = beforeSm12 ? 768 : 1024;
	    const unsigned int maxBlocksPerSM         = 8;

	    // Number of warps per block (threads rounded up to whole warps), then
	    // rounded up to the warp allocation multiple.
	    size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
	    numWarps = ceiling(numWarps, warpAllocationMultiple);

	    // Registers per block: regs per thread * warp size * number of warps,
	    // rounded up to the register allocation unit.
	    size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
	    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

	    // Shared memory per block: static + dynamic, rounded up to the shared
	    // memory allocation unit.
	    size_t smemBytes  = attribs.sharedSizeBytes + bytesDynamicSharedMem;
	    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

	    // Per-resource block limits. A kernel using none of a resource is not
	    // limited by it, so fall back to the hard block limit in that case.
	    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
	    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
	    size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;

	    // The tightest limit wins; scale by the number of multiprocessors.
	    return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
	}

	extern "C"
	size_t maxBlocksFromPointer(void* kernel,
	size_t bytesDynamicSharedMem,
	size_t threadsPerBlock)
	{
	cudaDeviceProp devprop;
	int deviceID = -1;
	cudaError_t err = cudaGetDevice(&deviceID);
	if (err == cudaSuccess)
	{
	err = cudaGetDeviceProperties(&devprop, deviceID);
	if (err != cudaSuccess)
	return -1;

	cudaFuncAttributes attr;
	err = cudaFuncGetAttributes(&attr, (const char*)kernel);
	if (err != cudaSuccess)
	return -1;

	return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
	}

	return -1;
	}

cudpp_maximal_launch.cpp
No OneTemporary
Actions

File Metadata

cudpp_maximal_launch.cpp
View Options

Event Timeline

cudpp_maximal_launch.cppNo OneTemporaryActions

File Metadata

cudpp_maximal_launch.cppView Options

Event Timeline

cudpp_maximal_launch.cpp
No OneTemporary
Actions

cudpp_maximal_launch.cpp
View Options