implementation.cu

/*
============================================================================
Filename : implementation.cu
Author : Your name goes here
SCIPER : Your SCIPER number
============================================================================
*/
#include <iostream>
#include <iomanip>
#include <sys/time.h>
#include <cuda_runtime.h>
using namespace std;
// CPU Baseline
void array_process(double *input, double *output, int length, int iterations)
{
    double *temp;

    for(int n=0; n<(int) iterations; n++)
    {
        for(int i=1; i<length-1; i++)
        {
            for(int j=1; j<length-1; j++)
            {
                output[(i)*(length)+(j)] = (input[(i-1)*(length)+(j-1)] +
                                            input[(i-1)*(length)+(j)]   +
                                            input[(i-1)*(length)+(j+1)] +
                                            input[(i)*(length)+(j-1)]   +
                                            input[(i)*(length)+(j)]     +
                                            input[(i)*(length)+(j+1)]   +
                                            input[(i+1)*(length)+(j-1)] +
                                            input[(i+1)*(length)+(j)]   +
                                            input[(i+1)*(length)+(j+1)] ) / 9;
            }
        }
        // Keep the four central heat-source cells fixed at 1000
        output[(length/2-1)*length+(length/2-1)] = 1000;
        output[(length/2)*length+(length/2-1)]   = 1000;
        output[(length/2-1)*length+(length/2)]   = 1000;
        output[(length/2)*length+(length/2)]     = 1000;

        // Swap the buffers so the next iteration reads the freshly computed values
        temp   = input;
        input  = output;
        output = temp;
    }
}
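
// Naive GPU kernel: one thread per grid cell, all nine neighbours read directly from global memory.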
__global__ void Heat(double *input, double *output, int length)
{
    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;

    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = ( input[(i-1)*(length)+(j-1)] +
                                         input[(i-1)*(length)+(j)]   +
                                         input[(i-1)*(length)+(j+1)] +
                                         input[(i)*(length)+(j-1)]   +
                                         input[(i)*(length)+(j)]     +
                                         input[(i)*(length)+(j+1)]   +
                                         input[(i+1)*(length)+(j-1)] +
                                         input[(i+1)*(length)+(j)]   +
                                         input[(i+1)*(length)+(j+1)] ) / 9;
        }
        else output[(i)*(length)+(j)] = 1000;
    }
    else return;
}
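
// Shared-memory version: each block first stages a (blockDim.x+2) x (blockDim.y+2) tile
// (the block plus a one-element halo) into shared memory, then computes the stencil from it.
// The commented-out blocks below are earlier fill strategies (single thread, single warp,
// one block-stride loop over all threads).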
__global__ void Heat_sh(double *input, double *output, int len)
{
    extern __shared__ double s[];
    int length = len;
    int S_size = (blockDim.x+2)*(blockDim.y+2);
    int I = blockIdx.y*blockDim.y;
    int J = blockIdx.x*blockDim.x;
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x; //tid in a thread block
    int a = 0;
    int b = 0;

    /*
    //first thread fills s
    if(tid == 0)
    {
        for(int k = 0 ; k<S_size; k++)
        {
            a = k/(blockDim.x+2);
            b = k%(blockDim.x+2);
            if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1": the tile starts one row and one column before the block
            {
                s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
            }
        }
    }

    //first warp fills s
    int warp_size = 32;
    int warp_id = threadIdx.x / warp_size;
    int ALU_id = threadIdx.x % warp_size;
    if(warp_id == 0)
    {
        for(int k = ALU_id ; k<S_size; k += warp_size)
        {
            a = k/(blockDim.x+2);
            b = k%(blockDim.x+2);
            if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length))
            {
                s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
            }
        }
    }

    //all threads fill s
    for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.x+2);
        b = k%(blockDim.x+2);
        if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length))
        {
            s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
        }
    }
    */

    //unrolled fill: each thread loads at most two tile elements
    a = tid/(blockDim.x+2);
    b = tid%(blockDim.x+2);
    if((a*(blockDim.x+2) + b < S_size) && (I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1": the tile starts one row and one column before the block
    {
        s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
    }
    a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2);
    b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2);
    if((a*(blockDim.x+2) + b < S_size) && (I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length))
    {
        s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
    }
    __syncthreads();

    a = threadIdx.y+1; //row of this thread's cell inside s
    b = threadIdx.x+1; //column of this thread's cell inside s
    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;
    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x+2) + b-1] + s[(a-1)*(blockDim.x+2) + b] + s[(a-1)*(blockDim.x+2) + b+1]
                                       + s[(a)*(blockDim.x+2)   + b-1] + s[(a)*(blockDim.x+2)   + b] + s[(a)*(blockDim.x+2)   + b+1]
                                       + s[(a+1)*(blockDim.x+2) + b-1] + s[(a+1)*(blockDim.x+2) + b] + s[(a+1)*(blockDim.x+2) + b+1] ) / 9;
        }
        else output[(i)*(length)+(j)] = 1000;
    }
    else return;
}
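
// Overlapping-block version: the grid is launched with a stride of (blockDim-2) in each
// dimension, so every thread loads exactly one tile element and only the interior
// (blockDim-2) x (blockDim-2) threads of each block write a result.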
__global__ void Heat_sh_mod(double *input, double *output, int len)
{
    extern __shared__ double s[];
    int length = len;
    // Blocks overlap by two rows/columns, so each block starts (blockDim-2) past the previous one.
    int In_x = blockIdx.x*(blockDim.x-2); // global column of the tile's left halo column
    int In_y = blockIdx.y*(blockDim.y-2); // global row of the tile's top halo row
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x; //tid in a thread block

    // Each thread loads one element of the blockDim.y x blockDim.x tile (halo included).
    int row = In_y - 1 + (int)threadIdx.y;
    int col = In_x - 1 + (int)threadIdx.x;
    if((row >= 0) && (col >= 0) && (row < length) && (col < length))
    {
        s[tid] = input[row*length + col];
    }
    __syncthreads();

    int a = threadIdx.y; //row of this thread's cell inside s
    int b = threadIdx.x; //column of this thread's cell inside s
    int i = In_y + a - 1; //global row updated by this thread
    int j = In_x + b - 1; //global column updated by this thread

    // Only the interior threads of the tile write a result; the outer ring only loads the halo.
    if((a > 0) && (b > 0) && (a < blockDim.y-1) && (b < blockDim.x-1) && (i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x) + b-1] + s[(a-1)*(blockDim.x) + b] + s[(a-1)*(blockDim.x) + b+1]
                                       + s[(a)*(blockDim.x)   + b-1] + s[(a)*(blockDim.x)   + b] + s[(a)*(blockDim.x)   + b+1]
                                       + s[(a+1)*(blockDim.x) + b-1] + s[(a+1)*(blockDim.x) + b] + s[(a+1)*(blockDim.x) + b+1] ) / 9;
        }
        else output[(i)*(length)+(j)] = 1000;
    }
    else return;
}
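
// Two-step shared-memory version: after staging the tile, partial sums of three horizontal
// neighbours are stored (transposed) in a second shared buffer "red", so the final 3x3 sum
// only needs three reads per output cell.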
__global__ void Heat_sh_red_1(double *input, double *output, int length)
{
    extern __shared__ double shared[];
    int S_size   = (blockDim.x+2)*(blockDim.y+2);
    int red_size = (blockDim.x)*(blockDim.y+2);
    double *s   = shared;
    double *red = (double*)&shared[S_size];
    int I = blockIdx.y*blockDim.y;
    int J = blockIdx.x*blockDim.x;
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x; //tid in a thread block
    int a = 0;
    int b = 0;

    //all threads fill s ("-1": the tile starts one row and one column before the block)
    for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.x+2);
        b = k%(blockDim.x+2);
        if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length))
        {
            s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
        }
    }
    __syncthreads();

    //partial sums of three horizontal neighbours, stored transposed (a and b swapped)
    for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.y+2);
        b = k%(blockDim.y+2);
        red[a*(blockDim.y+2) + b] = s[b*(blockDim.x+2) + a] + s[b*(blockDim.x+2) + a+1] + s[b*(blockDim.x+2) + a+2];
    }
    __syncthreads();

    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;
    a = threadIdx.y;
    b = threadIdx.x;
    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
        }
        else output[(i)*(length)+(j)] = 1000;
    }
    else return;
}
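
// Variant of the reduction kernel that computes the partial three-element sums directly
// from global memory, skipping the staging of the full tile in shared memory.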
__global__ void Heat_sh_red_2(double *input, double *output, int length)
{
    extern __shared__ double red[];
    int red_size = (blockDim.x)*(blockDim.y+2);
    int I = blockIdx.y*blockDim.y;
    int J = blockIdx.x*blockDim.x;
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x; //tid in a thread block
    int a = 0;
    int b = 0;

    //partial sums of three horizontal neighbours read from global memory, stored transposed (a and b swapped)
    for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.y+2);
        b = k%(blockDim.y+2);
        if((J-1+a >= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) // "-1": the sums start one row and one column before the block
        {
            red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];
        }
    }
    __syncthreads();

    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;
    a = threadIdx.y;
    b = threadIdx.x;
    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
        }
        else output[(i)*(length)+(j)] = 1000;
    }
    else return;
}
// GPU Optimized function
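// Allocates device buffers, copies the input grid to the GPU, runs the selected kernel for
// the requested number of iterations (ping-ponging the two device buffers), and copies the
// result back. Each phase is timed with CUDA events.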
void GPU_array_process(double *input, double *output, int length, int iterations)
{
    //Cuda events for calculating elapsed time
    cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
    cudaEventCreate(&cpy_H2D_start);
    cudaEventCreate(&cpy_H2D_end);
    cudaEventCreate(&cpy_D2H_start);
    cudaEventCreate(&cpy_D2H_end);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_end);

    /* Preprocessing goes here */
    size_t SIZE = (size_t)length*length*sizeof(double);
    double* gpu_input;
    double* gpu_output;
    double* temp; //used to switch pointers between gpu_input and gpu_output
    cudaMalloc((void**)&gpu_input, SIZE);
    cudaMalloc((void**)&gpu_output, SIZE);

    cudaEventRecord(cpy_H2D_start);
    /* Copying array from host to device goes here */
    cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice);

    int row_threads = 16;
    int col_threads = 16;
    //dim3 thrsPerBlock(row_threads, col_threads);
    //dim3 nBlks((length-1)/row_threads + 1, (length-1)/col_threads + 1); //length == 1,...,16 -> 1 block; length == 17,...,32 -> 2 blocks; etc.
    //Heat_sh_mod uses overlapping blocks, so the grid is sized with an effective block width of (blockDim-2)
    dim3 nBlks((length-1)/(row_threads-2) + 1, (length-1)/(col_threads-2) + 1);
    dim3 thrsPerBlock(row_threads, col_threads);

    cudaEventRecord(cpy_H2D_end);
    cudaEventSynchronize(cpy_H2D_end);

    cudaEventRecord(comp_start);
    /* GPU calculation goes here */
    for(int n = 0; n < iterations; n++)
    {
        //Heat<<<nBlks, thrsPerBlock>>>(gpu_input, gpu_output, length);
        //Heat_sh<<<nBlks, thrsPerBlock, ((row_threads+2)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);
        Heat_sh_mod<<<nBlks, thrsPerBlock, (row_threads*col_threads)*sizeof(double)>>>(gpu_input, gpu_output, length);
        //Heat_sh_red_1<<<nBlks, thrsPerBlock, (((row_threads+2)*(col_threads+2)) + ((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
        //Heat_sh_red_2<<<nBlks, thrsPerBlock, ((row_threads)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);

        //swap the buffers so the next iteration reads the freshly computed values
        temp = gpu_input;
        gpu_input = gpu_output;
        gpu_output = temp;
    }
    //undo the last swap so gpu_output points at the buffer holding the final result
    //(keeping the two pointers distinct so both buffers are freed exactly once)
    temp = gpu_input;
    gpu_input = gpu_output;
    gpu_output = temp;
    cudaEventRecord(comp_end);
    cudaEventSynchronize(comp_end);

    cudaEventRecord(cpy_D2H_start);
    /* Copying array from device to host goes here */
    cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost);
    cudaEventRecord(cpy_D2H_end);
    cudaEventSynchronize(cpy_D2H_end);

    /* Postprocessing goes here */
    cudaFree(gpu_input);
    cudaFree(gpu_output);

    float time;
    cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
    cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;

    cudaEventElapsedTime(&time, comp_start, comp_end);
    cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;

    cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
    cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
}
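
/*
 * Minimal usage sketch (assumption: in the course framework, main() lives in a separate
 * driver file; the hypothetical main below only illustrates how GPU_array_process is called,
 * and the initial conditions shown are illustrative, not prescribed).
 *
 * int main()
 * {
 *     int length = 128;                                  // grid is length x length
 *     int iterations = 100;
 *     double *input  = new double[length*length]();      // zero-initialised
 *     double *output = new double[length*length]();
 *     input[(length/2-1)*length + (length/2-1)] = 1000;  // central heat sources
 *     input[(length/2-1)*length + (length/2)]   = 1000;
 *     input[(length/2)*length   + (length/2-1)] = 1000;
 *     input[(length/2)*length   + (length/2)]   = 1000;
 *
 *     GPU_array_process(input, output, length, iterations);
 *
 *     delete[] input;
 *     delete[] output;
 *     return 0;
 * }
 */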
