Page MenuHomec4science

implementation.cu
No OneTemporary

File Metadata

Created
Sun, Feb 2, 23:47

implementation.cu

/*
============================================================================
Filename : implementation.cu
Author : Your name goes here
SCIPER : Your SCIPER number
============================================================================
*/
#include <iostream>
#include <iomanip>
#include <sys/time.h>
#include <cuda_runtime.h>
using namespace std;
#define SHARED_MEM_SIZE 98304 // 96 KB
#define CACHE_LINE 128
#define WARP_SIZE 32
#define COL_THREADS 16
#define ROW_THREADS 8
// CPU Baseline
// Reference implementation of the heat-propagation stencil on the host.
// Each iteration replaces every interior cell of the length x length grid
// with the mean of its 3x3 neighbourhood, then forces the four centre
// cells to 1000 (the heat source). The buffers are swapped locally after
// every iteration, so after an odd number of iterations the final state
// is in the caller's `output` buffer and after an even number it is in
// the caller's `input` buffer (this matches the original contract).
void array_process(double *input, double *output, int length, int iterations)
{
    for (int step = 0; step < iterations; ++step)
    {
        for (int row = 1; row < length - 1; ++row)
        {
            // Walk three source rows and one destination row with
            // pointers instead of recomputing row*length per cell.
            const double *above = input + (row - 1) * length;
            const double *mid   = input +  row      * length;
            const double *below = input + (row + 1) * length;
            double       *dst   = output + row      * length;

            for (int col = 1; col < length - 1; ++col)
            {
                double sum = above[col - 1] + above[col] + above[col + 1]
                           + mid[col - 1]   + mid[col]   + mid[col + 1]
                           + below[col - 1] + below[col] + below[col + 1];
                dst[col] = sum / 9;
            }
        }

        // Re-assert the constant heat source at the grid centre.
        int c = length / 2;
        output[(c - 1) * length + (c - 1)] = 1000;
        output[ c      * length + (c - 1)] = 1000;
        output[(c - 1) * length +  c     ] = 1000;
        output[ c      * length +  c     ] = 1000;

        // Swap roles for the next iteration (local pointers only).
        double *swap = input;
        input = output;
        output = swap;
    }
}
/*
 * Design notes (shared-memory scheme — sketch, superseded by the kernel below):
 * - Allocate double-precision shared memory for each block.
 * - 16 threads per block row, aiming at full occupancy; each double
 *   occupies two 4-byte banks.
 * - With a 128-byte shared-memory line, one row should hold at most
 *   128 / sizeof(double) = 16 values.
 *
 * Column-reduction pass:
 * - Launch a kernel that sums each column with its two vertical
 *   neighbours (a 3-row reduction of the matrix), avoiding bank conflicts.
 * - Store the transposed result into output, then swap input and output.
 *
 * Row-reduction pass:
 * - Launch a second kernel that sums over rows, padding the inner
 *   dimension by 1 to avoid bank conflicts.
 * - Store the result into output.
 */
/*
 * One Jacobi step of the heat stencil.
 *
 * Expected launch layout (see GPU_array_process):
 *   blockDim = (COL_THREADS, ROW_THREADS); gridDim.x * blockDim.x >= length,
 *   and the device buffers are padded to a row pitch of
 *   gridDim.x * blockDim.x doubles. The grid may over-cover the rows
 *   (YBLOCKS is computed for an overlapping scheme); out-of-range threads
 *   simply return.
 *
 * Each in-range thread owns exactly one cell:
 *   - interior cells get the mean of their 3x3 neighbourhood of `input`;
 *   - the four centre cells are forced to 1000 (constant heat source);
 *   - border cells are copied through unchanged, so the border values
 *     survive the input/output pointer swap between iterations.
 *
 * Fixes over the previous version: the debug write of blockIdx.x to
 * output is gone, __syncthreads() is no longer reached divergently
 * (no block-wide sync is needed at all), and the averaged result is
 * actually written to `output`.
 */
__global__ void Heat(double *input, double *output, int length)
{
    const unsigned int n     = (unsigned int)length;
    const unsigned int pitch = gridDim.x * blockDim.x; // padded row stride (doubles)
    const unsigned int x     = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y     = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the logical n x n grid do nothing.
    if (x >= n || y >= n)
        return;

    const unsigned int k = y * pitch + x;

    // Border cells are never averaged; propagate them so they are still
    // present in `output` after the host swaps the buffers.
    if (x == 0 || y == 0 || x == n - 1 || y == n - 1) {
        output[k] = input[k];
        return;
    }

    // The four centre cells form a fixed heat source, re-asserted after
    // every step (same as the CPU baseline).
    const unsigned int c = n / 2;
    if ((x == c - 1 || x == c) && (y == c - 1 || y == c)) {
        output[k] = 1000;
        return;
    }

    // 3x3 neighbourhood mean. All reads stay inside the first n rows and
    // n columns of the padded buffer, so no out-of-bounds access occurs.
    output[k] = (input[k - pitch - 1] + input[k - pitch] + input[k - pitch + 1] +
                 input[k - 1]         + input[k]         + input[k + 1]         +
                 input[k + pitch - 1] + input[k + pitch] + input[k + pitch + 1]) / 9;
}
// GPU Optimized function
// GPU implementation of array_process. `input` and `output` are host
// buffers of length*length doubles; after `iterations` Jacobi steps the
// final state is written into `output`. Prints the elapsed time of the
// H2D copy, the computation, and the D2H copy.
//
// Fixes over the previous version:
//  - cudaMemcpy copied `length` BYTES per row instead of
//    length * sizeof(double); rows are now copied with cudaMemcpy2D
//    using byte sizes.
//  - the two device buffers were aliased before cudaFree, double-freeing
//    one allocation and leaking the other; the original pointers are now
//    kept and freed exactly once each.
//  - the per-iteration cudaDeviceSynchronize() is gone: kernels launched
//    on the same stream execute in order, and cudaEventSynchronize on
//    comp_end waits for the whole sequence.
void GPU_array_process(double *input, double *output, int length, int iterations)
{
    // CUDA events for timing the three phases.
    cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
    cudaEventCreate(&cpy_H2D_start);
    cudaEventCreate(&cpy_H2D_end);
    cudaEventCreate(&cpy_D2H_start);
    cudaEventCreate(&cpy_D2H_end);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_end);

    // Grid geometry (ceiling divisions, same values as before so the
    // launch stays compatible with the kernel's expectations). Device
    // rows are padded to a pitch of TILE doubles.
    unsigned int XBLOCKS = (length + COL_THREADS - 1) / COL_THREADS;
    unsigned int YBLOCKS = (length + (ROW_THREADS - 2) - 1) / (ROW_THREADS - 2);
    unsigned int TILE    = XBLOCKS * COL_THREADS;
    size_t       SIZE    = (size_t)TILE * TILE * sizeof(double);

    // Two device buffers, ping-ponged between iterations. Keep the
    // original pointers so each allocation is freed exactly once.
    double *buf_a = NULL;
    double *buf_b = NULL;
    cudaMalloc((void**)&buf_a, SIZE);
    cudaMalloc((void**)&buf_b, SIZE);
    // Zero the padding so uninitialised pitch columns never feed the stencil.
    cudaMemset(buf_a, 0, SIZE);
    cudaMemset(buf_b, 0, SIZE);

    cudaEventRecord(cpy_H2D_start);
    // Strided copy: host rows are length doubles, device rows TILE doubles.
    cudaMemcpy2D(buf_a, TILE * sizeof(double),
                 input, length * sizeof(double),
                 length * sizeof(double), length,
                 cudaMemcpyHostToDevice);
    cudaEventRecord(cpy_H2D_end);
    cudaEventSynchronize(cpy_H2D_end);

    dim3 thrsPerBlock(COL_THREADS, ROW_THREADS);
    dim3 nBlks(XBLOCKS, YBLOCKS);
    double *d_in  = buf_a;
    double *d_out = buf_b;

    cudaEventRecord(comp_start);
    for (int i = 0; i < iterations; ++i)
    {
        Heat<<<nBlks, thrsPerBlock>>>(d_in, d_out, length);
        // Same-stream kernels are serialized; just swap the buffer roles.
        double *t = d_in;
        d_in  = d_out;
        d_out = t;
    }
    cudaEventRecord(comp_end);
    cudaEventSynchronize(comp_end); // waits for every queued kernel

    // Surface launch/execution errors instead of failing silently.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        cerr << "CUDA error: " << cudaGetErrorString(err) << endl;

    cudaEventRecord(cpy_D2H_start);
    // After the final swap the latest result lives in d_in.
    cudaMemcpy2D(output, length * sizeof(double),
                 d_in, TILE * sizeof(double),
                 length * sizeof(double), length,
                 cudaMemcpyDeviceToHost);
    cudaEventRecord(cpy_D2H_end);
    cudaEventSynchronize(cpy_D2H_end);

    cudaFree(buf_a);
    cudaFree(buf_b);

    float time;
    cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
    cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
    cudaEventElapsedTime(&time, comp_start, comp_end);
    cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;
    cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
    cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;

    cudaEventDestroy(cpy_H2D_start);
    cudaEventDestroy(cpy_H2D_end);
    cudaEventDestroy(comp_start);
    cudaEventDestroy(comp_end);
    cudaEventDestroy(cpy_D2H_start);
    cudaEventDestroy(cpy_D2H_end);
}

Event Timeline