/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple caching allocator for device memory allocations. The allocator is
* thread-safe and capable of managing device allocations on multiple devices.
******************************************************************************/
#pragma once
#ifndef __CUDA_ARCH__
#include <set>
#include <map> // NVCC (EDG, really) takes FOREVER to compile std::map
#endif
#include <math.h>
#include "util_namespace.cuh"
#include "util_debug.cuh"
#include "host/spinlock.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* CachingDeviceAllocator (host use)
******************************************************************************/
/**
* \brief A simple caching allocator for device memory allocations.
*
* \par Overview
* The allocator is thread-safe and is capable of managing cached device allocations
* on multiple devices. It behaves as follows:
*
* \par
* - Allocations are categorized by bin size.
* - Bin sizes progress geometrically in accordance with the growth factor
* \p bin_growth provided during construction. Unused device allocations within
* a larger bin cache are not reused for allocation requests that categorize to
* smaller bin sizes.
* - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
* (\p bin_growth ^ \p min_bin).
* - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
* bin and are simply freed when they are deallocated instead of being returned
* to a bin-cache.
* - If the total storage of cached allocations on a given device would exceed
* \p max_cached_bytes, allocations for that device are simply freed when they are
* deallocated instead of being returned to their bin-cache.
*
* \par
* For example, the default-constructed CachingDeviceAllocator is configured with:
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = 6MB - 1B
*
* \par
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
* and sets a maximum of 6,291,455 cached bytes per device.
*
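* \par
* A minimal usage sketch (host code; \p d_scratch and the byte counts are
* illustrative, not part of the interface):
* \code
* CachingDeviceAllocator allocator; // default: bin_growth 8, bins 3..7
* void *d_scratch = NULL;
* allocator.DeviceAllocate(&d_scratch, 2048); // rounded up to the 4KB bin
* // ... launch kernels that use d_scratch ...
* allocator.DeviceFree(d_scratch); // returned to the 4KB bin-cache
* \endcode
*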
*/
struct CachingDeviceAllocator
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
enum
{
/// Invalid device ordinal
INVALID_DEVICE_ORDINAL = -1,
};
/**
* Integer pow function for unsigned base and exponent
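*
* For example, IntPow(8, 3) proceeds as: exp=3 (odd) -> retval=8, base=64;
* exp=1 (odd) -> retval=8*64=512, base=4096; exp=0 -> returns 512.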
*/
static unsigned int IntPow(
unsigned int base,
unsigned int exp)
{
unsigned int retval = 1;
while (exp > 0)
{
if (exp & 1) {
retval = retval * base; // multiply the result by the current base
}
base = base * base; // square the base
exp = exp >> 1; // divide the exponent in half
}
return retval;
}
/**
* Round up to the nearest power of the given base (returns both the
* exponent and the rounded-up byte count)
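*
* For example, NearestPowerOf(power, rounded_bytes, 8, 2048) yields power=4
* and rounded_bytes=4096, the smallest power of 8 that is >= 2048.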
*/
static void NearestPowerOf(
unsigned int &power,
size_t &rounded_bytes,
unsigned int base,
size_t value)
{
power = 0;
rounded_bytes = 1;
while (rounded_bytes < value)
{
rounded_bytes *= base;
power++;
}
}
/**
* Descriptor for device memory allocations
*/
struct BlockDescriptor
{
int device; // device ordinal
void* d_ptr; // Device pointer
size_t bytes; // Size of allocation in bytes
unsigned int bin; // Bin enumeration
// Constructor
BlockDescriptor(void *d_ptr, int device) :
device(device),
d_ptr(d_ptr),
bytes(0),
bin(0) {}
// Constructor
BlockDescriptor(size_t bytes, unsigned int bin, int device) :
device(device),
d_ptr(NULL),
bytes(bytes),
bin(bin) {}
// Comparison functor for comparing device pointers
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device < b.device) {
return true;
} else if (a.device > b.device) {
return false;
} else {
return (a.d_ptr < b.d_ptr);
}
}
// Comparison functor for comparing allocation sizes
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device < b.device) {
return true;
} else if (a.device > b.device) {
return false;
} else {
return (a.bytes < b.bytes);
}
}
};
/// BlockDescriptor comparator function interface
typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
/// Set type for cached blocks (ordered by size)
typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
/// Set type for live blocks (ordered by ptr)
typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
/// Map type from device ordinal to the number of bytes cached on that device
typedef std::map<int, size_t> GpuCachedBytes;
#endif // __CUDA_ARCH__
//---------------------------------------------------------------------
// Fields
//---------------------------------------------------------------------
Spinlock spin_lock; ///< Spinlock for thread-safety
unsigned int bin_growth; ///< Geometric growth factor for bin-sizes
unsigned int min_bin; ///< Minimum bin enumeration
unsigned int max_bin; ///< Maximum bin enumeration
size_t min_bin_bytes; ///< Minimum bin size
size_t max_bin_bytes; ///< Maximum bin size
size_t max_cached_bytes; ///< Maximum aggregate cached bytes per device
bool debug; ///< Whether or not to print (de)allocation events to stdout
bool skip_cleanup; ///< Whether or not to skip the call to FreeAllCached() in the destructor (the CUDA runtime may already have shut down for statically declared allocators)
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
GpuCachedBytes cached_bytes; ///< Map of device ordinal to aggregate cached bytes on that device
CachedBlocks cached_blocks; ///< Set of cached device allocations available for reuse
BusyBlocks live_blocks; ///< Set of live device allocations currently in use
#endif // __CUDA_ARCH__
#endif // DOXYGEN_SHOULD_SKIP_THIS
//---------------------------------------------------------------------
// Methods
//---------------------------------------------------------------------
/**
* \brief Constructor.
*/
CachingDeviceAllocator(
unsigned int bin_growth, ///< Geometric growth factor for bin-sizes
unsigned int min_bin, ///< Minimum bin
unsigned int max_bin, ///< Maximum bin
size_t max_cached_bytes) ///< Maximum aggregate cached bytes per device
:
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare),
#endif
debug(false),
skip_cleanup(false), // ensure the destructor's cleanup check reads a defined value
spin_lock(0),
bin_growth(bin_growth),
min_bin(min_bin),
max_bin(max_bin),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes(max_cached_bytes)
{}
/**
* \brief Default constructor.
*
* Configured with:
* \par
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
*
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
* and sets a maximum of 6,291,455 cached bytes per device.
*/
CachingDeviceAllocator(bool skip_cleanup = false) :
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare),
#endif
skip_cleanup(skip_cleanup),
debug(false),
spin_lock(0),
bin_growth(8),
min_bin(3),
max_bin(7),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes((max_bin_bytes * 3) - 1)
{}
/**
* \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
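*
* A hypothetical call (the 1MB limit is illustrative):
* \code
* allocator.SetMaxCachedBytes(1024 * 1024); // cache at most 1MB per device
* \endcode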
*/
cudaError_t SetMaxCachedBytes(
size_t max_cached_bytes)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
// Lock
Lock(&spin_lock);
this->max_cached_bytes = max_cached_bytes;
if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
// Unlock
Unlock(&spin_lock);
return cudaSuccess;
#endif // __CUDA_ARCH__
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the specified device
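*
* A hypothetical call (device ordinal and request size are illustrative):
* \code
* void *d_buf = NULL;
* allocator.DeviceAllocate(&d_buf, 64 * 1024, 1); // 64KB request rounds up to the 256KB bin on device 1
* \endcode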
*/
cudaError_t DeviceAllocate(
void** d_ptr,
size_t bytes,
int device)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}
// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}
BlockDescriptor search_key(bin_bytes, bin, device);
// Lock
Lock(&spin_lock);
locked = true;
do {
// Find a free block big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
if ((block_itr != cached_blocks.end()) &&
(block_itr->device == device) &&
(block_itr->bin == search_key.bin))
{
// Reuse existing cache block. Insert into live blocks.
search_key = *block_itr;
live_blocks.insert(search_key);
// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device] -= search_key.bytes;
if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
else
{
// Need to allocate a new cache block. Unlock.
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Set to specified device
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (CubDebug(error = cudaSetDevice(device))) break;
// Allocate
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
// Lock
if (!locked) {
Lock(&spin_lock);
locked = true;
}
// Insert into live blocks
live_blocks.insert(search_key);
if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
} while(0);
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Copy device pointer to output parameter (NULL on error)
*d_ptr = search_key.d_ptr;
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the current device
*/
cudaError_t DeviceAllocate(
void** d_ptr,
size_t bytes)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
do {
int current_device;
if (CubDebug(error = cudaGetDevice(&current_device))) break;
if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
} while(0);
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
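*
* A hypothetical call (assumes \p d_buf was obtained from DeviceAllocate on device 1):
* \code
* allocator.DeviceFree(d_buf, 1); // cached for reuse, or freed if over the cache limit
* \endcode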
*/
cudaError_t DeviceFree(
void* d_ptr,
int device)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
BlockDescriptor search_key(d_ptr, device);
// Lock
Lock(&spin_lock);
locked = true;
do {
// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr == live_blocks.end())
{
// Cannot find pointer
if (CubDebug(error = cudaErrorUnknown)) break;
}
else
{
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
// Check if we should keep the returned allocation
if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device] += search_key.bytes;
if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
else
{
// Free the returned allocation. Unlock.
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Set to specified device
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (CubDebug(error = cudaSetDevice(device))) break;
// Free device memory
if (CubDebug(error = cudaFree(d_ptr))) break;
if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
}
} while (0);
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees a live allocation of device memory on the current device, returning it to the allocator
*/
cudaError_t DeviceFree(
void* d_ptr)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
int current_device;
cudaError_t error = cudaSuccess;
do {
if (CubDebug(error = cudaGetDevice(&current_device))) break;
if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
} while(0);
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees all cached device allocations on all devices
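*
* A minimal sketch (e.g., before explicit runtime teardown):
* \code
* allocator.FreeAllCached(); // empty all bin-caches, e.g., prior to cudaDeviceReset()
* \endcode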
*/
cudaError_t FreeAllCached()
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
int current_device = INVALID_DEVICE_ORDINAL;
// Lock
Lock(&spin_lock);
locked = true;
while (!cached_blocks.empty())
{
// Get first block
CachedBlocks::iterator begin = cached_blocks.begin();
// Get entry-point device ordinal if necessary
if (entrypoint_device == INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
}
// Set current device ordinal if necessary
if (begin->device != current_device)
{
if (CubDebug(error = cudaSetDevice(begin->device))) break;
current_device = begin->device;
}
// Free device memory
if (CubDebug(error = cudaFree(begin->d_ptr))) break;
// Copy the size, reduce the balance, and erase the entry
// (the iterator must not be dereferenced after erase)
size_t block_bytes = begin->bytes;
cached_bytes[current_device] -= block_bytes;
cached_blocks.erase(begin);
if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
}
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Destructor
*/
virtual ~CachingDeviceAllocator()
{
if (!skip_cleanup)
FreeAllCached();
}
};
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
