diff --git a/A4/implementation.cu b/A4/implementation.cu
index f7925a6..1bf923c 100644
--- a/A4/implementation.cu
+++ b/A4/implementation.cu
@@ -1,431 +1,431 @@
/* ============================================================================
 Filename    : implementation.cu
 Author      : Your name goes here
 SCIPER      : Your SCIPER number
============================================================================ */
#include <iostream>
#include <iomanip>
#include <sys/time.h>
#include <cuda_runtime.h>

using namespace std;

// CPU Baseline
void array_process(double *input, double *output, int length, int iterations)
{
    double *temp;

    for(int n=0; n<iterations; n++)
    {
        for(int i=1; i<length-1; i++)
        {
            for(int j=1; j<length-1; j++)
            {
                output[(i)*(length)+(j)] = (input[(i-1)*(length)+(j-1)] +
                                            input[(i-1)*(length)+(j)]   +
                                            input[(i-1)*(length)+(j+1)] +
                                            input[(i)*(length)+(j-1)]   +
                                            input[(i)*(length)+(j)]     +
                                            input[(i)*(length)+(j+1)]   +
                                            input[(i+1)*(length)+(j-1)] +
                                            input[(i+1)*(length)+(j)]   +
                                            input[(i+1)*(length)+(j+1)] ) / 9;
            }
        }

        // the four central cells act as a constant heat source
        output[(length/2-1)*length+(length/2-1)] = 1000;
        output[(length/2)*length+(length/2-1)]   = 1000;
        output[(length/2-1)*length+(length/2)]   = 1000;
        output[(length/2)*length+(length/2)]     = 1000;

        temp   = input;
        input  = output;
        output = temp;
    }
}

// One thread per grid cell; the 3x3 average is read directly from global memory
__global__ void Heat(double *input, double *output, int length)
{
    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;

    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = ( input[(i-1)*(length)+(j-1)] +
                                         input[(i-1)*(length)+(j)]   +
                                         input[(i-1)*(length)+(j+1)] +
                                         input[(i)*(length)+(j-1)]   +
                                         input[(i)*(length)+(j)]     +
                                         input[(i)*(length)+(j+1)]   +
                                         input[(i+1)*(length)+(j-1)] +
                                         input[(i+1)*(length)+(j)]   +
                                         input[(i+1)*(length)+(j+1)] ) / 9;
        }
        else
            output[(i)*(length)+(j)] = 1000;
    }
    else
        return;
}

// Same stencil, but the block's (blockDim.x+2) x (blockDim.y+2) neighbourhood
// (tile plus halo) is first staged in shared memory
__global__ void Heat_sh(double *input, double *output, int len)
{
    extern __shared__ double s[];

    int length = len;
    int S_size = (blockDim.x+2)*(blockDim.y+2);
    int I = blockIdx.y*blockDim.y;
    int J = blockIdx.x*blockDim.x;
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x;   //tid in a thread block
    int a = 0;
    int b = 0;

    /*
    //first thread fills s
    if(tid == 0)
    {
        for(int k = 0; k<S_size; k++)
        {
            a = k/(blockDim.x+2);
            b = k%(blockDim.x+2);
            if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
            {
                s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
            }
        }
    }

    //first warp fills s
    int warp_size = 32;
    int warp_id = threadIdx.x / warp_size;
    int ALU_id  = threadIdx.x % warp_size;
    if(warp_id == 0)
    {
        for(int k = ALU_id; k<S_size; k += warp_size)
        {
            a = k/(blockDim.x+2);
            b = k%(blockDim.x+2);
            if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
            {
                s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
            }
        }
    }

    //all threads fill s
    for(int k = tid; k<S_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.x+2);
        b = k%(blockDim.x+2);
        if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
        {
            s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
        }
    }
    */

    //unrolled fill: each thread loads at most two elements of the tile
    a = tid/(blockDim.x+2);
    b = tid%(blockDim.x+2);
    if((a*(blockDim.x+2) + b < S_size) && (I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
    {
        s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
    }
    a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2);
    b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2);
    if((a*(blockDim.x+2) + b < S_size) && (I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
    {
        s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
    }

    __syncthreads();

    a = threadIdx.y+1;  //access s
    b = threadIdx.x+1;  //access s
    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;

    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x+2) + b-1] + s[(a-1)*(blockDim.x+2) + b] + s[(a-1)*(blockDim.x+2) + b+1] +
                                         s[(a)*(blockDim.x+2) + b-1]   + s[(a)*(blockDim.x+2) + b]   + s[(a)*(blockDim.x+2) + b+1]   +
                                         s[(a+1)*(blockDim.x+2) + b-1] + s[(a+1)*(blockDim.x+2) + b] + s[(a+1)*(blockDim.x+2) + b+1] ) / 9;
        }
        else
            output[(i)*(length)+(j)] = 1000;
    }
    else
        return;
}

// Variant with overlapping blocks: each block loads exactly blockDim.x x blockDim.y
// elements (no halo) and only its interior threads compute, so neighbouring blocks
// overlap by two rows/columns
__global__ void Heat_sh_mod(double *input, double *output, int len)
{
    extern __shared__ double s[];

    int length = len;
    //int S_size_x = blockDim.x;
    //int S_size_y = blockDim.y;
    //int size_x = (blockDim.x-2);
    //int size_y = (blockDim.y-2);
    int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x;                //nb_blocks in x *blockDim.x
    int In_y = ((blockIdx.y*gridDim.x)*blockDim.y) - 2*blockIdx.y;  //((nb_blocks in y)*blockDim.y) * nb_element per line
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x;               //tid in a thread block

-    if((In_y-1)>=0)&&(In_x-1)>=0))
+    if(((In_y-1)>=0)&&((In_x-1)>=0)&&(In_y-1 + threadIdx.y<length)&&(In_x-1 + threadIdx.x<length))
    {
-        s[tid] = input[(In_y-1)*(gridDim.x*blockDim.x) + In_x-1 + tid];
+        s[tid] = input[(In_y-1)*(gridDim.x*blockDim.x) + In_x-1 + (threadIdx.y)*blockDim.x + threadIdx.x];
    }
    __syncthreads();

    int a = threadIdx.y;  //access s
    int b = threadIdx.x;  //access s
    int i = In_y*(gridDim.x*blockDim.x) + a-1;
    int j = In_x + b-1;

    if((a > 0) && (b > 0) && (a < blockDim.y-1) && (b < blockDim.x-1) && (i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x) + b-1] + s[(a-1)*(blockDim.x) + b] + s[(a-1)*(blockDim.x) + b+1] +
                                         s[(a)*(blockDim.x) + b-1]   + s[(a)*(blockDim.x) + b]   + s[(a)*(blockDim.x) + b+1]   +
                                         s[(a+1)*(blockDim.x) + b-1] + s[(a+1)*(blockDim.x) + b] + s[(a+1)*(blockDim.x) + b+1] ) / 9;
        }
        else
            output[(i)*(length)+(j)] = 1000;
    }
    else
        return;
}

// Variant that first computes 3-point row sums (stored transposed in red), then adds
// three consecutive row sums per output cell to obtain the 3x3 total
__global__ void Heat_sh_red_1(double *input, double *output, int length)
{
    extern __shared__ double shared[];

    int S_size   = (blockDim.x+2)*(blockDim.y+2);
    int red_size = (blockDim.x)*(blockDim.y+2);
    double *s   = shared;
    double *red = (double*)&shared[S_size];

    int I = blockIdx.y*blockDim.y;
    int J = blockIdx.x*blockDim.x;
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x;   //tid in a thread block
    int a = 0;
    int b = 0;

    //all threads fill s
    for(int k = tid; k<S_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.x+2);
        b = k%(blockDim.x+2);
        if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
        {
            s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
        }
    }
    __syncthreads();

    for(int k = tid; k<red_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.y+2);
        b = k%(blockDim.y+2);
        red[a*(blockDim.y+2) + b] = s[b*(blockDim.x+2) + a] + s[b*(blockDim.x+2) + a+1] + s[b*(blockDim.x+2) + a+2];   //reduction + transpose (a and b swapped)
    }
    __syncthreads();

    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;
    a = threadIdx.y;
    b = threadIdx.x;

    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
        }
        else
            output[(i)*(length)+(j)] = 1000;
    }
    else
        return;
}

// Same reduction idea, but the 3-point row sums are computed directly from global
// memory (no staging tile s)
__global__ void Heat_sh_red_2(double *input, double *output, int length)
{
    extern __shared__ double red[];

    int red_size = (blockDim.x)*(blockDim.y+2);
    int I = blockIdx.y*blockDim.y;
    int J = blockIdx.x*blockDim.x;
    int tid = (threadIdx.y)*blockDim.x + threadIdx.x;   //tid in a thread block
    int a = 0;
    int b = 0;

    for(int k = tid; k<red_size; k += blockDim.x*blockDim.y)
    {
        a = k/(blockDim.y+2);
        b = k%(blockDim.y+2);
        if((J-1+a >= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
        {
            red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];   //reduction + transpose (a and b swapped)
        }
    }
    __syncthreads();

    int i = (blockIdx.y * blockDim.y) + threadIdx.y;
    int j = (blockIdx.x * blockDim.x) + threadIdx.x;
    a = threadIdx.y;
    b = threadIdx.x;

    if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
    {
        if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
        {
            output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
        }
        else
            output[(i)*(length)+(j)] = 1000;
    }
    else
        return;
}

// GPU Optimized function
void GPU_array_process(double *input, double *output, int length, int iterations)
{
    //Cuda events for calculating elapsed time
    cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
    cudaEventCreate(&cpy_H2D_start);
    cudaEventCreate(&cpy_H2D_end);
    cudaEventCreate(&cpy_D2H_start);
    cudaEventCreate(&cpy_D2H_end);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_end);

    /* Preprocessing goes here */
    int SIZE = length*length*sizeof(double);
    double* gpu_input;
    double* gpu_output;
    double* temp;   //used to switch pointers between gpu_input and gpu_output
    cudaMalloc((void**)&gpu_input, SIZE);
    cudaMalloc((void**)&gpu_output, SIZE);

    cudaEventRecord(cpy_H2D_start);
    /* Copying array from host to device goes here */
    cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice);

    int row_threads = 16;
    int col_threads = 16;
    //dim3 thrsPerBlock(row_threads, col_threads);
    //dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1);   //if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc...
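    // Grid sizing for the overlapping-tile kernel (Heat_sh_mod): each block loads a full
    // row_threads x col_threads tile into shared memory, but only its interior
    // (row_threads-2) x (col_threads-2) threads write results, so neighbouring blocks must
    // overlap by two rows/columns and the grid is sized in steps of (threads-2).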
    dim3 nBlks((length-1)/(row_threads-2) + 1, (length-1)/(col_threads-2) + 1);
    dim3 thrsPerBlock(row_threads, col_threads);

    cudaEventRecord(cpy_H2D_end);
    cudaEventSynchronize(cpy_H2D_end);   //Copy array from host to device

    cudaEventRecord(comp_start);
    /* GPU calculation goes here */
    for(int i = 0; i < iterations; i++)
    {
        //Heat<<<nBlks, thrsPerBlock>>>(gpu_input, gpu_output, length);
        //Heat_sh<<<nBlks, thrsPerBlock, ((row_threads+2)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);
        Heat_sh_mod<<<nBlks, thrsPerBlock, (row_threads*col_threads)*sizeof(double)>>>(gpu_input, gpu_output, length);
        //Heat_sh_red_1<<<nBlks, thrsPerBlock, (((row_threads+2)*(col_threads+2)) + ((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
        //Heat_sh_red_2<<<nBlks, thrsPerBlock, (((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);

        // ping-pong the buffers for the next iteration
        temp = gpu_input;
        gpu_input = gpu_output;
        gpu_output = temp;
    }
    // unswap from the last iteration so gpu_output points at the final result
    // (and gpu_input at the other buffer, so each allocation is freed exactly once)
    temp = gpu_input;
    gpu_input = gpu_output;
    gpu_output = temp;

    cudaEventRecord(comp_end);
    cudaEventSynchronize(comp_end);

    cudaEventRecord(cpy_D2H_start);
    /* Copying array from device to host goes here */
    cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost);
    cudaEventRecord(cpy_D2H_end);
    cudaEventSynchronize(cpy_D2H_end);

    /* Postprocessing goes here */
    cudaFree(gpu_input);
    cudaFree(gpu_output);

    float time;
    cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
    cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;

    cudaEventElapsedTime(&time, comp_start, comp_end);
    cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;

    cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
    cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
}