cuda_data.h
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Oct 4, 18:58

cuda_data.h
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

	Original Version:
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	See the README file in the top-level LAMMPS directory.

	-----------------------------------------------------------------------

	USER-CUDA Package and associated modifications:
	https://sourceforge.net/projects/lammpscuda/

	Christian Trott, christian.trott@tu-ilmenau.de
	Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
	Theoretical Physics II, University of Technology Ilmenau, Germany

	See the README file in the USER-CUDA directory.

	This software is distributed under the GNU General Public License.
	------------------------------------------------------------------------- */

	#ifndef _CUDA_DATA_H_
	#define _CUDA_DATA_H_


	enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
	//xx==x in atom_vec x is a member therefore copymode x produces compile errors
	#include "cuda_shared.h"
	#include "cuda_wrapper_cu.h"
	#include "cuda_data_cu.h"
	#include <ctime>

	#include <cstdio>
	#include <typeinfo>
	template <typename host_type, typename dev_type, copy_mode mode>
	class cCudaData
	{
	protected:
	void** buffer;
	int* buf_size;
	host_type* host_data;
	dev_array* dev_data_array;
	dev_type* temp_data;
	unsigned nbytes;
	bool owns_dev_array;
	bool current_data_on_device; //this is not yet working as intended and therefore deactivated
	bool current_data_on_host;
	bool is_continues;
	bool pinned;

	public:
	cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
	cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
	~cCudaData();
	void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;};
	void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;};
	void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;};
	void set_host_data(host_type* host_data);
	void* get_host_data() { return host_data;};
	void set_buffer(void** buffer,int* buf_size,bool ais_continues);
	unsigned int* get_dim() {return dev_data_array->dim;};
	// if you want to upload data to the gpu, which will not change there, then set will_be_changed=false
	// if you want to upload data to the gpu and update it there, then set will_be_changed=true (default)
	void upload(bool will_be_changed=true);
	void uploadAsync(int stream, bool will_be_changed=true );
	// if you want to download data just to have a look at it, then set will_be_changed=false
	// if you are going to modify the downloaded data, then set will_be_changed=true (default)
	void download(bool will_be_changed=true);
	void downloadAsync(int stream);
	void memset_device(int value);
	void device_data_has_changed() {current_data_on_device=false;}
	void host_data_has_changed() {current_data_on_host=false;}
	int dev_size() {
	int size = dev_data_array->dim[0]*sizeof(dev_type);
	if(dev_data_array->dim[1]) size*=dev_data_array->dim[1];
	if(dev_data_array->dim[2]) size*=dev_data_array->dim[2];
	return size;}
	};


	template <typename host_type, typename dev_type, copy_mode mode>
	cCudaData<host_type, dev_type, mode>
	::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
	{
	pinned=is_pinned;
	owns_dev_array = false;
	current_data_on_device = false;
	current_data_on_host = false;
	is_continues = false;
	this->host_data = host_data;
	this->dev_data_array = dev_data_array;
	unsigned ndev;
	if((mode == x)\|\|(mode==xx))
	{
	ndev = dim_x;
	dev_data_array->dim[0] = dim_x;
	dev_data_array->dim[1] = 0;
	dev_data_array->dim[2] = 0;
	}
	else if(mode == xy \|\| mode == yx )
	{
	ndev = dim_x * dim_y;
	dev_data_array->dim[0] = dim_x;
	dev_data_array->dim[1] = dim_y;
	dev_data_array->dim[2] = 0;
	}
	else
	{
	ndev = dim_x * dim_y * dim_z;
	dev_data_array->dim[0] = dim_x;
	dev_data_array->dim[1] = dim_y;
	dev_data_array->dim[2] = dim_z;
	}
	nbytes = ndev * sizeof(dev_type);
	if(nbytes<=0)
	{
	host_data=NULL;
	temp_data=NULL;
	dev_data_array->dev_data=NULL;
	return;
	}

	dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
	if(((mode!=x)&&(mode!=xx)) \|\| typeid(host_type) != typeid(dev_type))
	{
	if(not pinned)
	temp_data = new dev_type[ndev];
	else
	{
	temp_data = (dev_type) CudaWrapper_AllocPinnedHostData(ndevsizeof(dev_type));
	}
	}
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	cCudaData<host_type, dev_type, mode>
	::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
	{
	pinned=is_pinned;
	this->dev_data_array = new dev_array;
	this->owns_dev_array = true;
	current_data_on_device = false;
	current_data_on_host = false;
	is_continues = false;
	this->host_data = host_data;
	unsigned ndev;
	if((mode == x)\|\|(mode==xx))
	{
	ndev = dim_x;
	dev_data_array->dim[0] = dim_x;
	dev_data_array->dim[1] = 0;
	dev_data_array->dim[2] = 0;
	}
	else if(mode == xy \|\| mode == yx )
	{
	ndev = dim_x * dim_y;
	dev_data_array->dim[0] = dim_x;
	dev_data_array->dim[1] = dim_y;
	dev_data_array->dim[2] = 0;
	}
	else
	{
	ndev = dim_x * dim_y * dim_z;
	dev_data_array->dim[0] = dim_x;
	dev_data_array->dim[1] = dim_y;
	dev_data_array->dim[2] = dim_z;
	}
	nbytes = ndev * sizeof(dev_type);
	if(nbytes<=0)
	{
	host_data=NULL;
	temp_data=NULL;
	dev_data_array->dev_data=NULL;
	return;
	}

	dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
	if(((mode!=x)&&(mode!=xx)) \|\| (typeid(host_type) != typeid(dev_type)))
	{
	if(not pinned)
	temp_data = new dev_type[ndev];
	else
	{
	temp_data = (dev_type) CudaWrapper_AllocPinnedHostData(ndevsizeof(dev_type));
	}
	}
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	cCudaData<host_type, dev_type, mode>
	::~cCudaData()
	{
	if(((mode!=x)&&(mode!=xx)) \|\| typeid(host_type) != typeid(dev_type))
	{
	if(not pinned)
	delete [] temp_data;
	else
	{
	CudaWrapper_FreePinnedHostData((void*)temp_data);
	}
	}
	if((dev_data_array->dev_data)&&(nbytes>0))
	CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes);
	if(owns_dev_array) delete dev_data_array;
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::set_host_data(host_type* host_data)
	{
	this->host_data = host_data;
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::upload(bool will_be_changed)
	{
	// if current data is already up, do not re-upload it
	// if(current_data_on_device) return;
	if(buffer&&is_continues)
	{
	printf("Actual Buffer: %p %i\n",buffer,buf_size);
	if(typeid(host_type)==typeid(double))
	{
	if(typeid(dev_type)==typeid(double))
	{
	CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	else if(typeid(dev_type)==typeid(float))
	{
	CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	}
	else if(typeid(host_type)==typeid(float))
	{
	if(typeid(dev_type)==typeid(double))
	{
	CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	else if(typeid(dev_type)==typeid(float))
	{
	CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	}
	else if(typeid(host_type)==typeid(int))
	{
	if(typeid(dev_type)==typeid(int))
	{
	CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	}
	}
	switch(mode)
	{
	case x:
	{
	if(typeid(host_type) == typeid(dev_type))
	CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
	else
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	}
	break;
	}

	case xx:
	{
	if(typeid(host_type) == typeid(dev_type))
	CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
	else
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	}
	break;
	}

	case xy:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	break;
	}

	case yx:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	break;
	}
	case xyz:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[(idev_data_array->dim[1]+j)dev_data_array->dim[2]];
	for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
	{
	temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	break;
	}

	case xzy:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
	{
	dev_type* temp = &temp_data[(idev_data_array->dim[2]+k)dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	break;
	}
	}
	// we have uploaded the data to the device, i.e.:
	current_data_on_device = true;
	// the data is going to change on the device, making the host data out-dated
	if(will_be_changed) current_data_on_host = false;
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::uploadAsync(int stream,bool will_be_changed)
	{
	// if current data is already up, do not re-upload it
	// if(current_data_on_device) return;
	if(buffer&&is_continues)
	{
	printf("Actual Buffer: %p %i\n",buffer,buf_size);
	if(typeid(host_type)==typeid(double))
	{
	if(typeid(dev_type)==typeid(double))
	{
	CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	else if(typeid(dev_type)==typeid(float))
	{
	CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	}
	else if(typeid(host_type)==typeid(float))
	{
	if(typeid(dev_type)==typeid(double))
	{
	CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	else if(typeid(dev_type)==typeid(float))
	{
	CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	}
	else if(typeid(host_type)==typeid(int))
	{
	if(typeid(dev_type)==typeid(int))
	{
	CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
	dev_data_array->dim,mode,*buffer);
	current_data_on_device = true;
	if(will_be_changed) current_data_on_host = false;
	return;
	}
	}
	}
	switch(mode)
	{
	case x:
	{
	if(typeid(host_type) == typeid(dev_type))
	CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
	else
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
	}
	break;
	}

	case xx:
	{
	if(typeid(host_type) == typeid(dev_type))
	CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
	else
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
	}
	break;
	}

	case xy:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
	break;
	}

	case yx:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
	break;
	}
	case xyz:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[(idev_data_array->dim[1]+j)dev_data_array->dim[2]];
	for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
	{
	temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
	break;
	}

	case xzy:
	{
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
	{
	dev_type* temp = &temp_data[(idev_data_array->dim[2]+k)dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufUploadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
	break;
	}
	}
	// we have uploaded the data to the device, i.e.:
	current_data_on_device = true;
	// the data is going to change on the device, making the host data out-dated
	if(will_be_changed) current_data_on_host = false;
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::download(bool will_be_changed)
	{
	// if current data is already down, do not re-download it
	// if(current_data_on_host) return;
	switch(mode)
	{
	case x:
	{
	if(typeid(host_type) == typeid(dev_type))
	CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
	else
	{
	CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufDownloadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	}
	break;
	}

	case xx:
	{
	if(typeid(host_type) == typeid(dev_type))
	CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
	else
	{
	CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufDownloadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	}
	break;
	}

	case xy:
	{
	CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufDownloadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	break;
	}

	case yx:
	{
	CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufDownloadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	break;
	}

	case xyz:
	{
	CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
	for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
	{
	reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufDownloadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	break;
	}

	case xzy:
	{
	CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
	timespec time1,time2;
	clock_gettime(CLOCK_REALTIME,&time1);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
	{
	dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
	}
	}
	clock_gettime(CLOCK_REALTIME,&time2);
	CudaWrapper_AddCPUBufDownloadTime(
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
	break;
	}
	}
	// we have downloaded the data to the host, i.e.:
	current_data_on_host = true;
	// the data is going to change on the host, making the device data out-dated
	if(will_be_changed) current_data_on_device = false;
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::downloadAsync(int stream)
	{
	switch(mode)
	{
	case x:
	{
	if(typeid(host_type) == typeid(dev_type))
	{
	CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
	CudaWrapper_SyncStream(stream);
	}
	else
	{
	CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
	CudaWrapper_SyncStream(stream);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
	}
	break;
	}

	case xx:
	{
	if(typeid(host_type) == typeid(dev_type))
	{
	CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
	CudaWrapper_SyncStream(stream);
	}
	else
	{
	CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
	CudaWrapper_SyncStream(stream);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
	}
	break;
	}

	case xy:
	{
	CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
	CudaWrapper_SyncStream(stream);
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
	}
	}
	break;
	}

	case yx:
	{
	CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
	CudaWrapper_SyncStream(stream);
	for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
	{
	dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
	for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
	{
	reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
	}
	}
	break;
	}
	}
	}


	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::memset_device(int value)
	{
	CudaWrapper_Memset(dev_data_array->dev_data,value, nbytes);
	}

	template <typename host_type, typename dev_type, copy_mode mode>
	void cCudaData<host_type, dev_type, mode>
	::set_buffer(void** abuffer,int* abuf_size,bool ais_continues)
	{
	buffer = abuffer;
	buf_size = abuf_size;
	unsigned nbytes_buf=(nbytes/sizeof(dev_type))*sizeof(host_type);
	if(buffer!=NULL)
	if(not((typeid(host_type) == typeid(dev_type))&&(mode == x \|\| mode == xx)))
	{
	printf("Allocate Buffer: %p %i\n",buffer,buf_size);
	if(((buffer)!=NULL)&&(buf_size<nbytes_buf))
	CudaWrapper_FreeCudaData(buffer,buf_size);
	if(*buf_size<nbytes_buf)
	{buffer=CudaWrapper_AllocCudaData(nbytes_buf);buf_size=nbytes_buf;}
	printf("Allocate Buffer2: %p %i\n",buffer,buf_size);

	}
	is_continues=ais_continues;
	}
	#endif // _CUDA_DATA_H_

cuda_data.hNo OneTemporaryActions

File Metadata

cuda_data.hView Options

Event Timeline

cuda_data.h
No OneTemporary
Actions

cuda_data.h
View Options