/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple caching allocator for device memory allocations. The allocator is
* thread-safe and capable of managing device allocations on multiple devices.
******************************************************************************/
#pragma once
#ifndef __CUDA_ARCH__
#include <set>
#include <map> // NVCC (EDG, really) takes FOREVER to compile std::map
#endif
#include <math.h>
#include "util_namespace.cuh"
#include "util_debug.cuh"
#include "host/spinlock.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* CachingDeviceAllocator (host use)
******************************************************************************/
/**
* \brief A simple caching allocator for device memory allocations.
*
* \par Overview
* The allocator is thread-safe and is capable of managing cached device allocations
* on multiple devices. It behaves as follows:
*
* \par
* - Allocations are categorized by bin size.
* - Bin sizes progress geometrically in accordance with the growth factor
* \p bin_growth provided during construction. Unused device allocations within
* a larger bin cache are not reused for allocation requests that categorize to
* smaller bin sizes.
* - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
* (\p bin_growth ^ \p min_bin).
* - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
* bin and are simply freed when they are deallocated instead of being returned
* to a bin-cache.
* - If the total storage of cached allocations on a given device would exceed
* \p max_cached_bytes, allocations for that device are simply freed when they are
* deallocated instead of being returned to their bin-cache.
*
* \par
* For example, the default-constructed CachingDeviceAllocator is configured with:
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = 6MB - 1B
*
* \par
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
* and sets a maximum of 6,291,455 cached bytes per device.
*
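* \par
* A minimal usage sketch (host code; \p d_scratch and the byte counts are
* illustrative, not part of the interface):
* \code
* CachingDeviceAllocator allocator; // default: bin_growth 8, bins 3..7
* void *d_scratch = NULL;
* allocator.DeviceAllocate(&d_scratch, 2048); // rounded up to the 4KB bin
* // ... launch kernels that use d_scratch ...
* allocator.DeviceFree(d_scratch); // returned to the 4KB bin-cache
* \endcode
*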
*/
struct CachingDeviceAllocator
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
enum
{
/// Invalid device ordinal
INVALID_DEVICE_ORDINAL = -1,
};
/**
* Integer pow function for unsigned base and exponent
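*
* For example, IntPow(8, 3) proceeds as: exp=3 (odd) -> retval=8, base=64;
* exp=1 (odd) -> retval=8*64=512, base=4096; exp=0 -> returns 512.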
*/
static unsigned int IntPow(
unsigned int base,
unsigned int exp)
{
unsigned int retval = 1;
while (exp > 0)
{
if (exp & 1) {
retval = retval * base; // multiply the result by the current base
}
base = base * base; // square the base
exp = exp >> 1; // divide the exponent in half
}
return retval;
}
/**
* Round up to the nearest power of the given base (returns both the
* exponent and the rounded-up byte count)
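*
* For example, NearestPowerOf(power, rounded_bytes, 8, 2048) yields power=4
* and rounded_bytes=4096, the smallest power of 8 that is >= 2048.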
*/
static void NearestPowerOf(
unsigned int &power,
size_t &rounded_bytes,
unsigned int base,
size_t value)
{
power = 0;
rounded_bytes = 1;
while (rounded_bytes < value)
{
rounded_bytes *= base;
power++;
}
}
/**
* Descriptor for device memory allocations
*/
struct BlockDescriptor
{
int device; // device ordinal
void* d_ptr; // Device pointer
size_t bytes; // Size of allocation in bytes
unsigned int bin; // Bin enumeration
// Constructor
BlockDescriptor(void *d_ptr, int device) :
device(device),
d_ptr(d_ptr),
bytes(0),
bin(0) {}
// Constructor
BlockDescriptor(size_t bytes, unsigned int bin, int device) :
device(device),
d_ptr(NULL),
bytes(bytes),
bin(bin) {}
// Comparison functor for comparing device pointers
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device < b.device) {
return true;
} else if (a.device > b.device) {
return false;
} else {
return (a.d_ptr < b.d_ptr);
}
}
// Comparison functor for comparing allocation sizes
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device < b.device) {
return true;
} else if (a.device > b.device) {
return false;
} else {
return (a.bytes < b.bytes);
}
}
};
/// BlockDescriptor comparator function interface
typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
/// Set type for cached blocks (ordered by size)
typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
/// Set type for live blocks (ordered by ptr)
typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
/// Map type from device ordinal to the number of bytes cached on that device
typedef std::map<int, size_t> GpuCachedBytes;
#endif // __CUDA_ARCH__
//---------------------------------------------------------------------
// Fields
//---------------------------------------------------------------------
Spinlock spin_lock; ///< Spinlock for thread-safety
unsigned int bin_growth; ///< Geometric growth factor for bin-sizes
unsigned int min_bin; ///< Minimum bin enumeration
unsigned int max_bin; ///< Maximum bin enumeration
size_t min_bin_bytes; ///< Minimum bin size
size_t max_bin_bytes; ///< Maximum bin size
size_t max_cached_bytes; ///< Maximum aggregate cached bytes per device
bool debug; ///< Whether or not to print (de)allocation events to stdout
bool skip_cleanup; ///< Whether or not to skip the call to FreeAllCached() in the destructor (the CUDA runtime may already have shut down for statically declared allocators)
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
GpuCachedBytes cached_bytes; ///< Map of device ordinal to aggregate cached bytes on that device
CachedBlocks cached_blocks; ///< Set of cached device allocations available for reuse
BusyBlocks live_blocks; ///< Set of live device allocations currently in use
#endif // __CUDA_ARCH__
#endif // DOXYGEN_SHOULD_SKIP_THIS
//---------------------------------------------------------------------
// Methods
//---------------------------------------------------------------------
/**
* \brief Constructor.
*/
CachingDeviceAllocator(
unsigned int bin_growth, ///< Geometric growth factor for bin-sizes
unsigned int min_bin, ///< Minimum bin
unsigned int max_bin, ///< Maximum bin
size_t max_cached_bytes) ///< Maximum aggregate cached bytes per device
:
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare),
#endif
debug(false),
skip_cleanup(false), // ensure the destructor's cleanup check reads a defined value
spin_lock(0),
bin_growth(bin_growth),
min_bin(min_bin),
max_bin(max_bin),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes(max_cached_bytes)
{}
/**
* \brief Default constructor.
*
* Configured with:
* \par
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
*
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
* and sets a maximum of 6,291,455 cached bytes per device.
*/
CachingDeviceAllocator(bool skip_cleanup = false) :
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare),
#endif
skip_cleanup(skip_cleanup),
debug(false),
spin_lock(0),
bin_growth(8),
min_bin(3),
max_bin(7),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes((max_bin_bytes * 3) - 1)
{}
/**
* \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
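*
* A hypothetical call (the 1MB limit is illustrative):
* \code
* allocator.SetMaxCachedBytes(1024 * 1024); // cache at most 1MB per device
* \endcode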
*/
cudaError_t SetMaxCachedBytes(
size_t max_cached_bytes)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
// Lock
Lock(&spin_lock);
this->max_cached_bytes = max_cached_bytes;
if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
// Unlock
Unlock(&spin_lock);
return cudaSuccess;
#endif // __CUDA_ARCH__
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the specified device
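*
* A hypothetical call (device ordinal and request size are illustrative):
* \code
* void *d_buf = NULL;
* allocator.DeviceAllocate(&d_buf, 64 * 1024, 1); // 64KB request rounds up to the 256KB bin on device 1
* \endcode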
*/
cudaError_t DeviceAllocate(
void** d_ptr,
size_t bytes,
int device)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}
// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}
BlockDescriptor search_key(bin_bytes, bin, device);
// Lock
Lock(&spin_lock);
locked = true;
do {
// Find a free block big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
if ((block_itr != cached_blocks.end()) &&
(block_itr->device == device) &&
(block_itr->bin == search_key.bin))
{
// Reuse existing cache block. Insert into live blocks.
search_key = *block_itr;
live_blocks.insert(search_key);
// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device] -= search_key.bytes;
if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
else
{
// Need to allocate a new cache block. Unlock.
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Set to specified device
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (CubDebug(error = cudaSetDevice(device))) break;
// Allocate
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
// Lock
if (!locked) {
Lock(&spin_lock);
locked = true;
}
// Insert into live blocks
live_blocks.insert(search_key);
if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
} while(0);
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Copy device pointer to output parameter (NULL on error)
*d_ptr = search_key.d_ptr;
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the current device
*/
cudaError_t DeviceAllocate(
void** d_ptr,
size_t bytes)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
do {
int current_device;
if (CubDebug(error = cudaGetDevice(&current_device))) break;
if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
} while(0);
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
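*
* A hypothetical call (assumes \p d_buf was obtained from DeviceAllocate on device 1):
* \code
* allocator.DeviceFree(d_buf, 1); // cached for reuse, or freed if over the cache limit
* \endcode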
*/
cudaError_t DeviceFree(
void* d_ptr,
int device)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
BlockDescriptor search_key(d_ptr, device);
// Lock
Lock(&spin_lock);
locked = true;
do {
// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr == live_blocks.end())
{
// Cannot find pointer
if (CubDebug(error = cudaErrorUnknown)) break;
}
else
{
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
// Check if we should keep the returned allocation
if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device] += search_key.bytes;
if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
else
{
// Free the returned allocation. Unlock.
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Set to specified device
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (CubDebug(error = cudaSetDevice(device))) break;
// Free device memory
if (CubDebug(error = cudaFree(d_ptr))) break;
if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
}
} while (0);
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees a live allocation of device memory on the current device, returning it to the allocator
*/
cudaError_t DeviceFree(
void* d_ptr)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
int current_device;
cudaError_t error = cudaSuccess;
do {
if (CubDebug(error = cudaGetDevice(&current_device))) break;
if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
} while(0);
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees all cached device allocations on all devices
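*
* A minimal sketch (e.g., before explicit runtime teardown):
* \code
* allocator.FreeAllCached(); // empty all bin-caches, e.g., prior to cudaDeviceReset()
* \endcode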
*/
cudaError_t FreeAllCached()
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
int current_device = INVALID_DEVICE_ORDINAL;
// Lock
Lock(&spin_lock);
locked = true;
while (!cached_blocks.empty())
{
// Get first block
CachedBlocks::iterator begin = cached_blocks.begin();
// Get entry-point device ordinal if necessary
if (entrypoint_device == INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
}
// Set current device ordinal if necessary
if (begin->device != current_device)
{
if (CubDebug(error = cudaSetDevice(begin->device))) break;
current_device = begin->device;
}
// Free device memory
if (CubDebug(error = cudaFree(begin->d_ptr))) break;
// Copy the size, reduce the balance, and erase the entry
// (the iterator must not be dereferenced after erase)
size_t block_bytes = begin->bytes;
cached_bytes[current_device] -= block_bytes;
cached_blocks.erase(begin);
if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
}
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Destructor
*/
virtual ~CachingDeviceAllocator()
{
if (!skip_cleanup)
FreeAllCached();
}
};
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
