Page MenuHomec4science

implementation_louis.cu
No OneTemporary

File Metadata

Created
Tue, Dec 3, 22:54

implementation_louis.cu

/*
============================================================================
Filename : algorithm.c
Author : Your name goes here
SCIPER : Your SCIPER number
============================================================================
*/
#include <iostream>
#include <iomanip>
#include <sys/time.h>
#include <cuda_runtime.h>
using namespace std;
#define SHARED_MEM_SIZE 98304 // 96 KB
#define CACHE_LINE 128
#define WARP_SIZE 32
#define COL_THREADS 16
#define ROW_THREADS 8
// CPU Baseline
void array_process(double *input, double *output, int length, int iterations)
{
double *temp;
for(int n=0; n<(int) iterations; n++)
{
for(int i=1; i<length-1; i++)
{
for(int j=1; j<length-1; j++)
{
output[(i)*(length)+(j)] = (input[(i-1)*(length)+(j-1)] +
input[(i-1)*(length)+(j)] +
input[(i-1)*(length)+(j+1)] +
input[(i)*(length)+(j-1)] +
input[(i)*(length)+(j)] +
input[(i)*(length)+(j+1)] +
input[(i+1)*(length)+(j-1)] +
input[(i+1)*(length)+(j)] +
input[(i+1)*(length)+(j+1)] ) / 9;
}
}
output[(length/2-1)*length+(length/2-1)] = 1000;
output[(length/2)*length+(length/2-1)] = 1000;
output[(length/2-1)*length+(length/2)] = 1000;
output[(length/2)*length+(length/2)] = 1000;
temp = input;
input = output;
output = temp;
}
}
/*__global__ void Heat(double *input, double *output, int length)
{
int i = (blockIdx.x * blockDim.x) + threadIdx.x;
int j = (blockIdx.y * blockDim.y) + threadIdx.y;
if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
{
if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
{
output[(i)*(length)+(j)] = ( input[(i-1)*(length)+(j-1)] +
input[(i-1)*(length)+(j)] +
input[(i-1)*(length)+(j+1)] +
input[(i)*(length)+(j-1)] +
input[(i)*(length)+(j)] +
input[(i)*(length)+(j+1)] +
input[(i+1)*(length)+(j-1)] +
input[(i+1)*(length)+(j)] +
input[(i+1)*(length)+(j+1)] ) / 9;
}
else output[(i)*(length)+(j)] = 1000;
}
else return;
}*/
__global__ void Heat_sh(double *input, double *output, int length)
{
extern __shared__ double s[];
int S_size = (blockDim.x+2)*(blockDim.y+2);
int I = blockIdx.y*blockDim.y;
int J = blockIdx.x*blockDim.x;
int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block
int a = 0;
int b = 0;
/*
//first thread fills s
if(tid == 0)
{
for(int k = 0 ; k<S_size; k++)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
{
s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
}
}
}
//first warp fills s
int warp_size = 32;
int warp_id = threadIdx.x / warp_size;
int ALU_id = threadIdx.x % warp_size;
if(warp_id == 0)
{
for(int k = ALU_id ; k<S_size; k += warp_size)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
{
s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
}
}
}*/
//all threads fill s
for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
{
s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
}
}
__syncthreads();
a = threadIdx.y+1; //access s
b = threadIdx.x+1; //access s
int i = (blockIdx.y * blockDim.y) + threadIdx.y;
int j = (blockIdx.x * blockDim.x) + threadIdx.x;
if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
{
if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
{
output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x+2) + b-1] + s[(a-1)*(blockDim.x+2) + b] + s[(a-1)*(blockDim.x+2) + b+1]
+ s[(a)*(blockDim.x+2) + b-1] + s[(a)*(blockDim.x+2) + b] + s[(a)*(blockDim.x+2) + b+1]
+ s[(a+1)*(blockDim.x+2) + b-1] + s[(a+1)*(blockDim.x+2) + b] + s[(a+1)*(blockDim.x+2) + b+1] ) / 9;
}
else output[(i)*(length)+(j)] = 1000;
}
else return;
}
__global__ void Heat_raffa(double *input, double *output, int length)
{
// shared memory portion
extern __shared__ double sdata[];
unsigned int block_x = threadIdx.x;
unsigned int tid = threadIdx.x;
unsigned int x = (blockIdx.x * blockDim.x) + block_x; // absolute x
unsigned int y = (blockIdx.y * blockDim.y) + threadIdx.y; // absolute y
unsigned int TILE = gridDim.x * blockDim.x;
unsigned int k = y * TILE + x; // absolute index of the array
// index in shared memory: transpose
unsigned int sindex = block_x * blockDim.x + threadIdx.y;
output[k] = input[k]; // DEBUG: only single access
/*
// grid boundary
if (!((x > 0) && (y > 0) && (x < length-1) && (y < length-1))) {
sdata[sindex] = 0;
} else if (((x == length/2-1) || (x == length/2)) && ((y == length/2-1) || (y == length/2))) {
output[k] = 1000;
} else {
// all threads in a warp should share the same cache
// already sum the first three reduction
sdata[sindex] =
input[k-1] + // cache miss every right end block
input[k] +
input[k+1]; // cache miss every left end block
// NOTE: cache missing is better than
// assure all threads are synchronized
__syncthreads();
// select only 16 threads
// no bank conflicts only if COL_THREADS is 16
// TODO: branch on different warps, warning different offset computation
if (tid < COL_THREADS) {
unsigned int offset = 0;
// two rows more are fetched
for (unsigned int row = 1; row < (ROW_THREADS - 1); ++row) {
offset = COL_THREADS * row;
// select the same bank
sdata[sindex + offset] += sdata[sindex - COL_THREADS + offset] + sdata[sindex + COL_THREADS + offset];
}
}
// wait all threads are finished
__syncthreads();
if (tid / WARP_SIZE > 0 && tid / WARP_SIZE < (ROW_THREADS - 1)) {
// write data back into the global memory
// select a thread
for (unsigned int col = 0; col < COL_THREADS; ++col) {
// cache output once
output[y * length + x + col] = sdata[sindex + col];
}
}
output[k] = sdata[sindex];
}*/
}
__global__ void Heat_sh_red_1(double *input, double *output, int length)
{
extern __shared__ double shared[];
int S_size = (blockDim.x+2)*(blockDim.y+2);
int red_size = (blockDim.x)*(blockDim.y+2);
double *s = shared;
double *red = (double*)&shared[S_size];
int I = blockIdx.y*blockDim.y;
int J = blockIdx.x*blockDim.x;
int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block
int a = 0;
int b = 0;
//all threads fill s
for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
{
s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
}
}
__syncthreads();
for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.y+2);
b = k%(blockDim.y+2);
red[a*(blockDim.y+2) + b] = s[b*(blockDim.x+2) + a] + s[b*(blockDim.x+2) + a+1] + s[b*(blockDim.x+2) + a+2]; //reduction + transpose (a and b swaped)
}
__syncthreads();
int i = (blockIdx.y * blockDim.y) + threadIdx.y;
int j = (blockIdx.x * blockDim.x) + threadIdx.x;
a = threadIdx.y;
b = threadIdx.x;
if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
{
if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
{
output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
}
else output[(i)*(length)+(j)] = 1000;
}
else return;
}
__global__ void Heat_sh_red_2(double *input, double *output, int length)
{
extern __shared__ double red[];
int red_size = (blockDim.x)*(blockDim.y+2);
int I = blockIdx.y*blockDim.y;
int J = blockIdx.x*blockDim.x;
int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block
int a = 0;
int b = 0;
for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.y+2);
b = k%(blockDim.y+2);
if((J-1+a >= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
{
red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; //reduction + transpose (a and b swaped)
}
}
__syncthreads();
int i = (blockIdx.y * blockDim.y) + threadIdx.y;
int j = (blockIdx.x * blockDim.x) + threadIdx.x;
a = threadIdx.y;
b = threadIdx.x;
if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
{
if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
{
output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
}
else output[(i)*(length)+(j)] = 1000;
}
else return;
}
// GPU Optimized function
void GPU_array_process(double *input, double *output, int length, int iterations)
{
//Cuda events for calculating elapsed time
cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
cudaEventCreate(&cpy_H2D_start);
cudaEventCreate(&cpy_H2D_end);
cudaEventCreate(&cpy_D2H_start);
cudaEventCreate(&cpy_D2H_end);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_end);
/* Preprocessing goes here */
int SIZE = length*length*sizeof(double);
double* gpu_input;
double* gpu_output;
double* temp; //used to swith pointers between gpu_input and gpu_output
cudaMalloc((void**)&gpu_input, SIZE);
cudaMalloc((void**)&gpu_output, SIZE);
cudaEventRecord(cpy_H2D_start);
/* Copying array from host to device goes here */
cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice);
cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice);
int row_threads = 64;
int col_threads = 4;
dim3 thrsPerBlock(row_threads, col_threads);
dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1); //if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc...
cudaEventRecord(cpy_H2D_end);
cudaEventSynchronize(cpy_H2D_end);
//Copy array from host to device
cudaEventRecord(comp_start);
/* GPU calculation goes here */
for(size_t i = 0; i<(int) iterations; i++)
{
Heat<<<nBlks, thrsPerBlock>>>(gpu_input, gpu_output, length);
//Heat_sh<<<nBlks, thrsPerBlock, ((row_threads+2)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);
//Heat_raffa<<<nBlks, thrsPerBlock, ((row_threads+2)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);
//Heat_sh_red_1<<<nBlks, thrsPerBlock, (((row_threads+2)*(col_threads+2))+ ((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
//Heat_sh_red_2<<<nBlks, thrsPerBlock, (((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
temp = gpu_input;
gpu_input = gpu_output;
gpu_output = temp;
cudaDeviceSynchronize();
}
gpu_output = gpu_input; //unswap from the last iteration
cudaEventRecord(comp_end);
cudaEventSynchronize(comp_end);
cudaEventRecord(comp_end);
cudaEventSynchronize(comp_end);
cudaEventRecord(cpy_D2H_start);
/* Copying array from device to host goes here */
cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost);
cudaEventRecord(cpy_D2H_end);
cudaEventSynchronize(cpy_D2H_end);
/* Postprocessing goes here */
cudaFree(gpu_input);
cudaFree(gpu_output);
float time;
cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
cudaEventElapsedTime(&time, comp_start, comp_end);
cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;
cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
}

Event Timeline