diff --git a/A4/execute.sh b/A4/execute.sh index 2b32fd1..9c77193 100644 --- a/A4/execute.sh +++ b/A4/execute.sh @@ -1,21 +1,21 @@ #!/bin/bash #SBATCH --partition=gpu #SBATCH --qos=gpu_free #SBATCH --gres=gpu:1 #SBATCH --nodes=1 #SBATCH --time=1:0:0 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=1 #SBATCH --mem 1G #SBATCH --account cs307 #SBATCH --reservation CS307-gpu -length=16 -iterations=1 +length=5000 +iterations=2500 module load gcc cuda echo STARTING AT `date` ./assignment4 $length $iterations echo FINISHED at `date` diff --git a/A4/implementation.cu b/A4/implementation.cu index e2a059e..b2a2da6 100644 --- a/A4/implementation.cu +++ b/A4/implementation.cu @@ -1,429 +1,457 @@ /* ============================================================================ Filename : algorithm.c Author : Your name goes here SCIPER : Your SCIPER number ============================================================================ */ #include #include #include #include using namespace std; // CPU Baseline void array_process(double *input, double *output, int length, int iterations) { double *temp; for(int n=0; n<(int) iterations; n++) { for(int i=1; i 0) && (j > 0) && (i < length-1) && (j < length-1)) { if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) { output[(i)*(length)+(j)] = ( input[(i-1)*(length)+(j-1)] + input[(i-1)*(length)+(j)] + input[(i-1)*(length)+(j+1)] + input[(i)*(length)+(j-1)] + input[(i)*(length)+(j)] + input[(i)*(length)+(j+1)] + input[(i+1)*(length)+(j-1)] + input[(i+1)*(length)+(j)] + input[(i+1)*(length)+(j+1)] ) / 9; } else output[(i)*(length)+(j)] = 1000; } else return; } __global__ void Heat_sh(double *input, double *output, int len) { extern __shared__ double s[]; int length = len; int S_size = (blockDim.x+2)*(blockDim.y+2); int I = blockIdx.y*blockDim.y; int J = blockIdx.x*blockDim.x; int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block int a = 0; int b = 0; /* //first thread fills s if(tid == 0) { for(int k = 0 ; k= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; } } } //first warp fills s int warp_size = 32; int warp_id = threadIdx.x / warp_size; int ALU_id = threadIdx.x % warp_size; if(warp_id == 0) { for(int k = ALU_id ; k= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; } } } //all threads fill s for(int k = tid ; k= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; } } */ //unrolled fill a = tid/(blockDim.x+2); b = tid%(blockDim.x+2); if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; } a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2); b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2); if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; } __syncthreads(); a = threadIdx.y+1; //access s b = threadIdx.x+1; //access s int i = (blockIdx.y * blockDim.y) + threadIdx.y; int j = (blockIdx.x * blockDim.x) + threadIdx.x; if((i > 0) && (j > 0) && (i < length-1) && (j < length-1)) { if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) { output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x+2) + b-1] + s[(a-1)*(blockDim.x+2) + b] + s[(a-1)*(blockDim.x+2) + b+1] + s[(a)*(blockDim.x+2) + b-1] + s[(a)*(blockDim.x+2) + b] + s[(a)*(blockDim.x+2) + b+1] + s[(a+1)*(blockDim.x+2) + b-1] + s[(a+1)*(blockDim.x+2) + b] + s[(a+1)*(blockDim.x+2) + b+1] ) / 9; } else output[(i)*(length)+(j)] = 1000; } else return; } __global__ void Heat_sh_mod(double *input, double *output, int len) { extern __shared__ double s[]; int length = len; - //int S_size_x = blockDim.x; - //int S_size_y = blockDim.y; - - //int size_x = (blockDim.x-2); - //int size_y = (blockDim.y-2); - - - - int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x; //nb_blocks in x *blockDim.x - int In_y = ((blockIdx.y*gridDim.x)*blockDim.y) - 2*blockIdx.y; //((nb_blocks in y)*blockDim.y) * nb_element per line - int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block + int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x; //nb_blocks in x *blockDim.x + int In_y = blockIdx.y*blockDim.y - 2*blockIdx.y; //((nb_blocks in y)*blockDim.y) * nb_element per line - s[tid] = input[(In_y-1)*(gridDim.x*blockDim.x) + In_x - 1 + tid]; + if(((In_y-1 + (int)threadIdx.y)>=0)&&((In_x-1 + (int)threadIdx.x)>=0)&&(In_y-1 + (int)threadIdx.y 0) && (b > 0)&&(a 0)&&(j > 0)&&(i < length-1) && (j < length-1)) { if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) { output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x) + b-1] + s[(a-1)*(blockDim.x) + b] + s[(a-1)*(blockDim.x) + b+1] + s[(a)*(blockDim.x) + b-1] + s[(a)*(blockDim.x) + b] + s[(a)*(blockDim.x) + b+1] + s[(a+1)*(blockDim.x) + b-1] + s[(a+1)*(blockDim.x) + b] + s[(a+1)*(blockDim.x) + b+1] ) / 9; } else output[(i)*(length)+(j)] = 1000; } else return; } __global__ void Heat_sh_red_1(double *input, double *output, int length) { extern __shared__ double shared[]; int S_size = (blockDim.x+2)*(blockDim.y+2); int red_size = (blockDim.x)*(blockDim.y+2); double *s = shared; double *red = (double*)&shared[S_size]; int I = blockIdx.y*blockDim.y; int J = blockIdx.x*blockDim.x; int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block int a = 0; int b = 0; - - //all threads fill s + for(int k = tid ; k= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; } } + /* + //unrolled fill + + a = tid/(blockDim.x+2); + b = tid%(blockDim.x+2); + + if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + } + + a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2); + b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2); + + if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + }*/ + __syncthreads(); for(int k = tid ; k 0) && (j > 0) && (i < length-1) && (j < length-1)) { if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) { output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9; } else output[(i)*(length)+(j)] = 1000; } else return; } __global__ void Heat_sh_red_2(double *input, double *output, int length) { extern __shared__ double red[]; int red_size = (blockDim.x)*(blockDim.y+2); int I = blockIdx.y*blockDim.y; int J = blockIdx.x*blockDim.x; int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block int a = 0; int b = 0; + /* + a = tid/(blockDim.y+2); + b = tid%(blockDim.y+2); + if((a*(blockDim.x) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; + } + + a = (blockDim.x*blockDim.y+tid)/(blockDim.y); + b = (blockDim.x*blockDim.y+tid)%(blockDim.y); + + if((a*(blockDim.x) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; + }*/ + for(int k = tid ; k= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block { red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; //reduction + transpose (a and b swaped) } } __syncthreads(); int i = (blockIdx.y * blockDim.y) + threadIdx.y; int j = (blockIdx.x * blockDim.x) + threadIdx.x; a = threadIdx.y; b = threadIdx.x; if((i > 0) && (j > 0) && (i < length-1) && (j < length-1)) { if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) { output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9; } else output[(i)*(length)+(j)] = 1000; } else return; } // GPU Optimized function void GPU_array_process(double *input, double *output, int length, int iterations) { //Cuda events for calculating elapsed time cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end; cudaEventCreate(&cpy_H2D_start); cudaEventCreate(&cpy_H2D_end); cudaEventCreate(&cpy_D2H_start); cudaEventCreate(&cpy_D2H_end); cudaEventCreate(&comp_start); cudaEventCreate(&comp_end); /* Preprocessing goes here */ int SIZE = length*length*sizeof(double); double* gpu_input; double* gpu_output; double* temp; //used to swith pointers between gpu_input and gpu_output cudaMalloc((void**)&gpu_input, SIZE); cudaMalloc((void**)&gpu_output, SIZE); cudaEventRecord(cpy_H2D_start); /* Copying array from host to device goes here */ cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice); cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice); - int row_threads = 16; - int col_threads = 16; + int row_threads = 4; + int col_threads = 4; //dim3 thrsPerBlock(row_threads, col_threads); - //dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1); //if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc... - dim3 nBlks((length-1)/(row_threads-2) + 1,(length-1)/(col_threads-2) + 1); + dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1); //if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc... + //dim3 nBlks((length-1)/(row_threads-2) + 1,(length-1)/(col_threads-2) + 1); dim3 thrsPerBlock(row_threads, col_threads); cudaEventRecord(cpy_H2D_end); cudaEventSynchronize(cpy_H2D_end); //Copy array from host to device cudaEventRecord(comp_start); /* GPU calculation goes here */ for(size_t i = 0; i<(int) iterations; i++) { //Heat<<>>(gpu_input, gpu_output, length); //Heat_sh<<>>(gpu_input, gpu_output, length); - Heat_sh_mod<<>>(gpu_input, gpu_output, length); - //Heat_sh_red_1<<>>(gpu_input, gpu_output, length); + //Heat_sh_mod<<>>(gpu_input, gpu_output, length); + Heat_sh_red_1<<>>(gpu_input, gpu_output, length); //Heat_sh_red_2<<>>(gpu_input, gpu_output, length); temp = gpu_input; gpu_input = gpu_output; gpu_output = temp; } gpu_output = gpu_input; //unswap from the last iteration cudaEventRecord(comp_end); cudaEventSynchronize(comp_end); cudaEventRecord(cpy_D2H_start); /* Copying array from device to host goes here */ cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost); cudaEventRecord(cpy_D2H_end); cudaEventSynchronize(cpy_D2H_end); /* Postprocessing goes here */ cudaFree(gpu_input); cudaFree(gpu_output); float time; cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end); cout<<"Host to Device MemCpy takes "<