Page MenuHomec4science

implementation.cu
No OneTemporary

File Metadata

Created
Sun, Feb 2, 23:47

implementation.cu

/*
============================================================================
Filename : implementation.cu
Author : Your name goes here
SCIPER : Your SCIPER number
============================================================================
*/
#include <iostream>
#include <iomanip>
#include <sys/time.h>
#include <cuda_runtime.h>
using namespace std;
#define SHARED_MEM_SIZE 98304 // 96 KB
#define CACHE_LINE 128
#define WARP_SIZE 32
#define COL_THREADS 16
#define ROW_THREADS 8
// CPU Baseline
// Reference implementation of the heat-propagation stencil on the host.
// Each iteration replaces every interior cell of the length x length grid
// with the mean of its 3x3 neighbourhood, then forces the four centre
// cells to 1000 (the heat source). The buffers are swapped locally after
// every iteration, so after an odd number of iterations the final state
// is in the caller's `output` buffer and after an even number it is in
// the caller's `input` buffer (this matches the original contract).
void array_process(double *input, double *output, int length, int iterations)
{
    for (int step = 0; step < iterations; ++step)
    {
        for (int row = 1; row < length - 1; ++row)
        {
            // Walk three source rows and one destination row with
            // pointers instead of recomputing row*length per cell.
            const double *above = input + (row - 1) * length;
            const double *mid   = input +  row      * length;
            const double *below = input + (row + 1) * length;
            double       *dst   = output + row      * length;

            for (int col = 1; col < length - 1; ++col)
            {
                double sum = above[col - 1] + above[col] + above[col + 1]
                           + mid[col - 1]   + mid[col]   + mid[col + 1]
                           + below[col - 1] + below[col] + below[col + 1];
                dst[col] = sum / 9;
            }
        }

        // Re-assert the constant heat source at the grid centre.
        int c = length / 2;
        output[(c - 1) * length + (c - 1)] = 1000;
        output[ c      * length + (c - 1)] = 1000;
        output[(c - 1) * length +  c     ] = 1000;
        output[ c      * length +  c     ] = 1000;

        // Swap roles for the next iteration (local pointers only).
        double *swap = input;
        input = output;
        output = swap;
    }
}
/*
 * Design notes (shared-memory scheme — sketch, superseded by the kernel below):
 * - Allocate double-precision shared memory for each block.
 * - 16 threads per block row, aiming at full occupancy; each double
 *   occupies two 4-byte banks.
 * - With a 128-byte shared-memory line, one row should hold at most
 *   128 / sizeof(double) = 16 values.
 *
 * Column-reduction pass:
 * - Launch a kernel that sums each column with its two vertical
 *   neighbours (a 3-row reduction of the matrix), avoiding bank conflicts.
 * - Store the transposed result into output, then swap input and output.
 *
 * Row-reduction pass:
 * - Launch a second kernel that sums over rows, padding the inner
 *   dimension by 1 to avoid bank conflicts.
 * - Store the result into output.
 */
/*
 * One Jacobi step of the heat stencil.
 *
 * Expected launch layout (see GPU_array_process):
 *   blockDim = (COL_THREADS, ROW_THREADS); gridDim.x * blockDim.x >= length,
 *   and the device buffers are padded to a row pitch of
 *   gridDim.x * blockDim.x doubles. The grid may over-cover the rows
 *   (YBLOCKS is computed for an overlapping scheme); out-of-range threads
 *   simply return.
 *
 * Each in-range thread owns exactly one cell:
 *   - interior cells get the mean of their 3x3 neighbourhood of `input`;
 *   - the four centre cells are forced to 1000 (constant heat source);
 *   - border cells are copied through unchanged, so the border values
 *     survive the input/output pointer swap between iterations.
 *
 * Fixes over the previous version: the debug write of blockIdx.x to
 * output is gone, __syncthreads() is no longer reached divergently
 * (no block-wide sync is needed at all), and the averaged result is
 * actually written to `output`.
 */
__global__ void Heat(double *input, double *output, int length)
{
    const unsigned int n     = (unsigned int)length;
    const unsigned int pitch = gridDim.x * blockDim.x; // padded row stride (doubles)
    const unsigned int x     = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y     = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the logical n x n grid do nothing.
    if (x >= n || y >= n)
        return;

    const unsigned int k = y * pitch + x;

    // Border cells are never averaged; propagate them so they are still
    // present in `output` after the host swaps the buffers.
    if (x == 0 || y == 0 || x == n - 1 || y == n - 1) {
        output[k] = input[k];
        return;
    }

    // The four centre cells form a fixed heat source, re-asserted after
    // every step (same as the CPU baseline).
    const unsigned int c = n / 2;
    if ((x == c - 1 || x == c) && (y == c - 1 || y == c)) {
        output[k] = 1000;
        return;
    }

    // 3x3 neighbourhood mean. All reads stay inside the first n rows and
    // n columns of the padded buffer, so no out-of-bounds access occurs.
    output[k] = (input[k - pitch - 1] + input[k - pitch] + input[k - pitch + 1] +
                 input[k - 1]         + input[k]         + input[k + 1]         +
                 input[k + pitch - 1] + input[k + pitch] + input[k + pitch + 1]) / 9;
}
// GPU Optimized function
// GPU implementation of array_process. `input` and `output` are host
// buffers of length*length doubles; after `iterations` Jacobi steps the
// final state is written into `output`. Prints the elapsed time of the
// H2D copy, the computation, and the D2H copy.
//
// Fixes over the previous version:
//  - cudaMemcpy copied `length` BYTES per row instead of
//    length * sizeof(double); rows are now copied with cudaMemcpy2D
//    using byte sizes.
//  - the two device buffers were aliased before cudaFree, double-freeing
//    one allocation and leaking the other; the original pointers are now
//    kept and freed exactly once each.
//  - the per-iteration cudaDeviceSynchronize() is gone: kernels launched
//    on the same stream execute in order, and cudaEventSynchronize on
//    comp_end waits for the whole sequence.
void GPU_array_process(double *input, double *output, int length, int iterations)
{
    // CUDA events for timing the three phases.
    cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
    cudaEventCreate(&cpy_H2D_start);
    cudaEventCreate(&cpy_H2D_end);
    cudaEventCreate(&cpy_D2H_start);
    cudaEventCreate(&cpy_D2H_end);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_end);

    // Grid geometry (ceiling divisions, same values as before so the
    // launch stays compatible with the kernel's expectations). Device
    // rows are padded to a pitch of TILE doubles.
    unsigned int XBLOCKS = (length + COL_THREADS - 1) / COL_THREADS;
    unsigned int YBLOCKS = (length + (ROW_THREADS - 2) - 1) / (ROW_THREADS - 2);
    unsigned int TILE    = XBLOCKS * COL_THREADS;
    size_t       SIZE    = (size_t)TILE * TILE * sizeof(double);

    // Two device buffers, ping-ponged between iterations. Keep the
    // original pointers so each allocation is freed exactly once.
    double *buf_a = NULL;
    double *buf_b = NULL;
    cudaMalloc((void**)&buf_a, SIZE);
    cudaMalloc((void**)&buf_b, SIZE);
    // Zero the padding so uninitialised pitch columns never feed the stencil.
    cudaMemset(buf_a, 0, SIZE);
    cudaMemset(buf_b, 0, SIZE);

    cudaEventRecord(cpy_H2D_start);
    // Strided copy: host rows are length doubles, device rows TILE doubles.
    cudaMemcpy2D(buf_a, TILE * sizeof(double),
                 input, length * sizeof(double),
                 length * sizeof(double), length,
                 cudaMemcpyHostToDevice);
    cudaEventRecord(cpy_H2D_end);
    cudaEventSynchronize(cpy_H2D_end);

    dim3 thrsPerBlock(COL_THREADS, ROW_THREADS);
    dim3 nBlks(XBLOCKS, YBLOCKS);
    double *d_in  = buf_a;
    double *d_out = buf_b;

    cudaEventRecord(comp_start);
    for (int i = 0; i < iterations; ++i)
    {
        Heat<<<nBlks, thrsPerBlock>>>(d_in, d_out, length);
        // Same-stream kernels are serialized; just swap the buffer roles.
        double *t = d_in;
        d_in  = d_out;
        d_out = t;
    }
    cudaEventRecord(comp_end);
    cudaEventSynchronize(comp_end); // waits for every queued kernel

    // Surface launch/execution errors instead of failing silently.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        cerr << "CUDA error: " << cudaGetErrorString(err) << endl;

    cudaEventRecord(cpy_D2H_start);
    // After the final swap the latest result lives in d_in.
    cudaMemcpy2D(output, length * sizeof(double),
                 d_in, TILE * sizeof(double),
                 length * sizeof(double), length,
                 cudaMemcpyDeviceToHost);
    cudaEventRecord(cpy_D2H_end);
    cudaEventSynchronize(cpy_D2H_end);

    cudaFree(buf_a);
    cudaFree(buf_b);

    float time;
    cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
    cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
    cudaEventElapsedTime(&time, comp_start, comp_end);
    cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;
    cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
    cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;

    cudaEventDestroy(cpy_H2D_start);
    cudaEventDestroy(cpy_H2D_end);
    cudaEventDestroy(comp_start);
    cudaEventDestroy(comp_end);
    cudaEventDestroy(cpy_D2H_start);
    cudaEventDestroy(cpy_D2H_end);
}

Event Timeline