nvd_kernel.h
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Nov 13, 07:48

nvd_kernel.h
View Options

	/***************************************************************************
	nvd_kernel.h
	-------------------
	W. Michael Brown

	Utilities for dealing with CUDA Driver kernels

	__________________________________________________________________________
	This file is part of the Geryon Unified Coprocessor Library (UCL)
	__________________________________________________________________________

	begin : Tue Feb 9 2010
	copyright : (C) 2010 by W. Michael Brown
	email : brownw@ornl.gov
	***************************************************************************/

	/* -----------------------------------------------------------------------
	Copyright (2010) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the Simplified BSD License.
	----------------------------------------------------------------------- */

	#ifndef NVD_KERNEL
	#define NVD_KERNEL

	#include "nvd_device.h"
	#include <fstream>

	namespace ucl_cudadr {

	// Forward declarations of types that the classes below only name:
	// UCL_Texture is declared a friend of UCL_Program and UCL_Kernel, and the
	// container templates appear as pointer parameters of the
	// UCL_Kernel::set_arg / UCL_Kernel::add_arg overloads.
	class UCL_Texture;
	template <class numtyp> class UCL_D_Vec;
	template <class numtyp> class UCL_D_Mat;
	template <class hosttype, class devtype> class UCL_Vector;
	template <class hosttype, class devtype> class UCL_Matrix;
	// Capacity of the UCL_Kernel::_kernel_args array and the limit checked by
	// UCL_Kernel::add_arg.
	#define UCL_MAX_KERNEL_ARGS 256

	/// Class storing 1 or more kernel functions from a single string or file
	class UCL_Program {
	public:
	inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
	inline UCL_Program(UCL_Device &device, const void *program,
	const char flags="", std::string log=NULL) {
	_cq=device.cq();
	init(device);
	load_string(program,flags,log);
	}

	inline ~UCL_Program() {}

	/// Initialize the program with a device
	inline void init(UCL_Device &device) { _cq=device.cq(); }

	/// Clear any data associated with program
	/ \note Must call init() after each clear /
	inline void clear() { }

	/// Load a program from a file and compile with flags
	inline int load(const char filename, const char flags="",
	std::string *log=NULL) {
	std::ifstream in(filename);
	if (!in \|\| in.is_open()==false) {
	#ifndef UCL_NO_EXIT
	std::cerr << "UCL Error: Could not open kernel file: "
	<< filename << std::endl;
	UCL_GERYON_EXIT;
	#endif
	return UCL_FILE_NOT_FOUND;
	}

	std::string program((std::istreambuf_iterator<char>(in)),
	std::istreambuf_iterator<char>());
	in.close();
	return load_string(program.c_str(),flags,log);
	}

	/// Load a program from a string and compile with flags
	inline int load_string(const void program, const char flags="",
	std::string *log=NULL) {
	if (std::string(flags)=="BINARY")
	return load_binary((const char *)program);
	const unsigned int num_opts=2;
	CUjit_option options[num_opts];
	void *values[num_opts];

	// set up size of compilation log buffer
	options[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	values[0] = (void *)(int)10240;
	// set up pointer to the compilation log buffer
	options[1] = CU_JIT_INFO_LOG_BUFFER;
	char clog[10240];
	values[1] = clog;

	CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
	options,(void **)values);

	if (log!=NULL)
	*log=std::string(clog);

	if (err != CUDA_SUCCESS) {
	#ifndef UCL_NO_EXIT
	std::cerr << std::endl
	<< "----------------------------------------------------------\n"
	<< " UCL Error: Error compiling PTX Program...\n"
	<< "----------------------------------------------------------\n";
	std::cerr << log << std::endl;
	#endif
	return UCL_COMPILE_ERROR;
	}

	return UCL_SUCCESS;
	}

	/// Load a precompiled program from a file
	inline int load_binary(const char *filename) {
	CUmodule _module;
	CUresult err = cuModuleLoad(&_module,filename);
	if (err==301) {
	#ifndef UCL_NO_EXIT
	std::cerr << "UCL Error: Could not open binary kernel file: "
	<< filename << std::endl;
	UCL_GERYON_EXIT;
	#endif
	return UCL_FILE_NOT_FOUND;
	} else if (err!=CUDA_SUCCESS) {
	#ifndef UCL_NO_EXIT
	std::cerr << "UCL Error: Error loading binary kernel file: "
	<< filename << std::endl;
	UCL_GERYON_EXIT;
	#endif
	return UCL_FILE_NOT_FOUND;
	}
	//int ucl_error=UCL_SUCCESS;
	//if (err==301)
	// return UCL_FILE_NOT_FOUND;
	//else if (err!=CUDA_SUCCESS)
	// return UCL_ERROR;
	return UCL_SUCCESS;
	}

	friend class UCL_Kernel;
	private:
	CUmodule _module;
	CUstream _cq;
	friend class UCL_Texture;
	};

	/// Class for dealing with CUDA Driver kernels
	class UCL_Kernel {
	 public:
	  /// Create a kernel with no function attached and no arguments
	  UCL_Kernel() : _dimensions(1), _num_args(0) {
	    #if CUDA_VERSION < 4000
	    _param_size=0;
	    #endif
	    _num_blocks[0]=0;
	  }

	  /// Create a kernel and look up \e function in \e program
	  UCL_Kernel(UCL_Program &program, const char *function) :
	    _dimensions(1), _num_args(0) {
	    #if CUDA_VERSION < 4000
	    _param_size=0;
	    #endif
	    _num_blocks[0]=0;
	    set_function(program,function);
	    _cq=program._cq;
	  }

	  ~UCL_Kernel() {}

	  /// Clear any function associated with the kernel
	  inline void clear() { }

	  /// Get the kernel function from a program
	  /** On success the program's command queue becomes this kernel's default.
	    * \return UCL_SUCCESS or UCL_FUNCTION_NOT_FOUND **/
	  inline int set_function(UCL_Program &program, const char *function) {
	    CUresult err=cuModuleGetFunction(&_kernel,program._module,function);
	    if (err!=CUDA_SUCCESS) {
	      #ifndef UCL_NO_EXIT
	      std::cerr << "UCL Error: Could not find function: " << function
	                << " in program.\n";
	      UCL_GERYON_EXIT;
	      #endif
	      return UCL_FUNCTION_NOT_FOUND;
	    }
	    _cq=program._cq;
	    return UCL_SUCCESS;
	  }

	  /// Set the kernel argument.
	  /** If not a device pointer, this must be repeated each time the argument
	    * changes
	    * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
	  template <class dtype>
	  inline void set_arg(const unsigned index, const dtype * const arg) {
	    if (index==_num_args)
	      add_arg(arg);
	    else if (index<_num_args) {
	      // Drop constness the same way add_arg() does; the argument slot and
	      // cuParamSetv both take a non-const pointer
	      #if CUDA_VERSION >= 4000
	      _kernel_args[index]=const_cast<dtype *>(arg);
	      #else
	      CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], (void*)arg,
	                               sizeof(dtype)));
	      #endif
	    } else
	      assert(0==1); // Must add kernel parameters in sequential order
	  }

	  // NOTE(review): the four overloads below forward to a single-argument
	  // set_arg() taking a raw pointer. No such overload is visible in this
	  // class, so these look like they cannot resolve if instantiated --
	  // confirm against ucl_arg_kludge.h and callers.

	  /// Set a geryon container as a kernel argument.
	  template <class numtyp>
	  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
	    { set_arg(&arg->begin()); }

	  /// Set a geryon container as a kernel argument.
	  template <class numtyp>
	  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
	    { set_arg(&arg->begin()); }

	  /// Set a geryon container as a kernel argument.
	  template <class hosttype, class devtype>
	  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
	    { set_arg(&arg->device.begin()); }

	  /// Set a geryon container as a kernel argument.
	  template <class hosttype, class devtype>
	  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
	    { set_arg(&arg->device.begin()); }

	  /// Add a kernel argument.
	  /** Appends a device pointer as the next argument.
	    * \note At most UCL_MAX_KERNEL_ARGS arguments may be added **/
	  inline void add_arg(const CUdeviceptr* const arg) {
	    // Check capacity before storing so the argument array is never
	    // written past its end
	    assert(_num_args<UCL_MAX_KERNEL_ARGS);
	    #if CUDA_VERSION >= 4000
	    _kernel_args[_num_args]=(void *)arg;
	    #else
	    void* ptr = (void*)(size_t)(*arg);
	    // Round the running parameter offset up to the pointer's alignment
	    _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
	    CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
	    _offsets.push_back(_param_size);
	    _param_size+=sizeof(ptr);
	    #endif
	    _num_args++;
	  }

	  /// Add a kernel argument.
	  /** Appends the value pointed to by \e arg as the next argument.
	    * \note At most UCL_MAX_KERNEL_ARGS arguments may be added **/
	  template <class dtype>
	  inline void add_arg(const dtype* const arg) {
	    // Check capacity before storing so the argument array is never
	    // written past its end
	    assert(_num_args<UCL_MAX_KERNEL_ARGS);
	    #if CUDA_VERSION >= 4000
	    _kernel_args[_num_args]=const_cast<dtype * const>(arg);
	    #else
	    // Round the running parameter offset up to the alignment of dtype
	    _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
	    CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
	    _offsets.push_back(_param_size);
	    _param_size+=sizeof(dtype);
	    #endif
	    _num_args++;
	  }

	  /// Add a geryon container as a kernel argument.
	  template <class numtyp>
	  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
	    { add_arg(&arg->begin()); }

	  /// Add a geryon container as a kernel argument.
	  template <class numtyp>
	  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
	    { add_arg(&arg->begin()); }

	  /// Add a geryon container as a kernel argument.
	  template <class hosttype, class devtype>
	  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
	    { add_arg(&arg->device.begin()); }

	  /// Add a geryon container as a kernel argument.
	  template <class hosttype, class devtype>
	  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
	    { add_arg(&arg->device.begin()); }

	  /// Set the number of thread blocks and the number of threads in each block
	  /** \note This should be called before any arguments have been added
	      \note The default command queue is used for the kernel execution **/
	  inline void set_size(const size_t num_blocks, const size_t block_size) {
	    _dimensions=1;
	    _num_blocks[0]=num_blocks;
	    _num_blocks[1]=1;
	    _num_blocks[2]=1;
	    #if CUDA_VERSION >= 4000
	    _block_size[0]=block_size;
	    _block_size[1]=1;
	    _block_size[2]=1;
	    #else
	    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
	    #endif
	  }

	  /// Set the number of thread blocks and the number of threads in each block
	  /** \note This should be called before any arguments have been added
	      \note The default command queue for the kernel is changed to cq **/
	  inline void set_size(const size_t num_blocks, const size_t block_size,
	                       command_queue &cq)
	    { _cq=cq; set_size(num_blocks,block_size); }

	  /// Set the number of thread blocks and the number of threads in each block
	  /** \note This should be called before any arguments have been added
	      \note The default command queue is used for the kernel execution **/
	  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
	                       const size_t block_size_x, const size_t block_size_y) {
	    _dimensions=2;
	    _num_blocks[0]=num_blocks_x;
	    _num_blocks[1]=num_blocks_y;
	    _num_blocks[2]=1;
	    #if CUDA_VERSION >= 4000
	    _block_size[0]=block_size_x;
	    _block_size[1]=block_size_y;
	    _block_size[2]=1;
	    #else
	    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
	    #endif
	  }

	  /// Set the number of thread blocks and the number of threads in each block
	  /** \note This should be called before any arguments have been added
	      \note The default command queue for the kernel is changed to cq **/
	  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
	                       const size_t block_size_x, const size_t block_size_y,
	                       command_queue &cq)
	    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}

	  /// Set the number of thread blocks and the number of threads in each block
	  /** Two-dimensional grid of blocks with three-dimensional blocks.
	      \note This should be called before any arguments have been added
	      \note The default command queue is used for the kernel execution **/
	  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
	                       const size_t block_size_x,
	                       const size_t block_size_y, const size_t block_size_z) {
	    _dimensions=2;
	    _num_blocks[0]=num_blocks_x;
	    _num_blocks[1]=num_blocks_y;
	    _num_blocks[2]=1;
	    #if CUDA_VERSION >= 4000
	    _block_size[0]=block_size_x;
	    _block_size[1]=block_size_y;
	    _block_size[2]=block_size_z;
	    #else
	    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
	                                     block_size_z));
	    #endif
	  }

	  /// Set the number of thread blocks and the number of threads in each block
	  /** \note This should be called before any arguments have been added
	      \note The default command queue for the kernel is changed to cq **/
	  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
	                       const size_t block_size_x, const size_t block_size_y,
	                       const size_t block_size_z, command_queue &cq) {
	    _cq=cq;
	    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
	             block_size_z);
	  }

	  /// Run the kernel in the default command queue
	  inline void run() {
	    #if CUDA_VERSION >= 4000
	    CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
	                                _num_blocks[2],_block_size[0],_block_size[1],
	                                _block_size[2],0,_cq,_kernel_args,NULL));
	    #else
	    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
	    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
	    #endif
	  }

	  /// Clear any arguments associated with the kernel
	  inline void clear_args() {
	    _num_args=0;
	    #if CUDA_VERSION < 4000
	    _offsets.clear();
	    _param_size=0;
	    #endif
	  }

	  /// Return the default command queue/stream associated with this data
	  inline command_queue & cq() { return _cq; }
	  /// Change the default command queue associated with matrix
	  inline void cq(command_queue &cq_in) { _cq=cq_in; }
	  #include "ucl_arg_kludge.h"

	 private:
	  CUfunction _kernel;        // function handle set by set_function()
	  CUstream _cq;              // stream used by run()
	  unsigned _dimensions;      // set by the set_size() overloads
	  unsigned _num_blocks[3];   // grid dimensions passed to the launch call
	  unsigned _num_args;        // number of arguments added so far
	  friend class UCL_Texture;

	  #if CUDA_VERSION >= 4000
	  unsigned _block_size[3];                    // block dimensions for launch
	  void * _kernel_args[UCL_MAX_KERNEL_ARGS];   // pointers to each argument
	  #else
	  std::vector<unsigned> _offsets;   // parameter-space offset per argument
	  unsigned _param_size;             // bytes of parameter space used so far
	  #endif
	};

	} // namespace

	#endif

nvd_kernel.hNo OneTemporaryActions

File Metadata

nvd_kernel.hView Options

Event Timeline

nvd_kernel.h
No OneTemporary
Actions

nvd_kernel.h
View Options