diff --git a/A4/assignment4 b/A4/assignment4 index 94c0b72..3d1b66b 100755 Binary files a/A4/assignment4 and b/A4/assignment4 differ diff --git a/A4/implementation.cu b/A4/implementation.cu index 6b249b1..8e0f343 100644 --- a/A4/implementation.cu +++ b/A4/implementation.cu @@ -1,279 +1,429 @@ /* ============================================================================ Filename : algorithm.c Author : Your name goes here SCIPER : Your SCIPER number ============================================================================ */ #include #include #include #include using namespace std; -#define SHARED_MEM_SIZE 98304 // 96 KB -#define CACHE_LINE 128 -#define WARP_SIZE 32 - -#define COL_THREADS 16 -#define ROW_THREADS 8 - // CPU Baseline void array_process(double *input, double *output, int length, int iterations) { double *temp; for(int n=0; n<(int) iterations; n++) { for(int i=1; i 0) && (j > 0) && (i < length-1) && (j < length-1)) + { + if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) + { + output[(i)*(length)+(j)] = ( input[(i-1)*(length)+(j-1)] + + input[(i-1)*(length)+(j)] + + input[(i-1)*(length)+(j+1)] + + input[(i)*(length)+(j-1)] + + input[(i)*(length)+(j)] + + input[(i)*(length)+(j+1)] + + input[(i+1)*(length)+(j-1)] + + input[(i+1)*(length)+(j)] + + input[(i+1)*(length)+(j+1)] ) / 9; + } + else output[(i)*(length)+(j)] = 1000; + } + else return; +} + __global__ void Heat(double *input, double *output, int length) { - // shared memory portion - __shared__ double sdata[ROW_THREADS][COL_THREADS]; + int i = (blockIdx.y * blockDim.y) + threadIdx.y; + int j = (blockIdx.x * blockDim.x) + threadIdx.x; + + + if((i > 0) && (j > 0) && (i < length-1) && (j < length-1)) + { + if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) + { + output[(i)*(length)+(j)] = ( input[(i-1)*(length)+(j-1)] + + input[(i-1)*(length)+(j)] + + input[(i-1)*(length)+(j+1)] + + input[(i)*(length)+(j-1)] + + input[(i)*(length)+(j)] + + input[(i)*(length)+(j+1)] + + input[(i+1)*(length)+(j-1)] + + input[(i+1)*(length)+(j)] + + input[(i+1)*(length)+(j+1)] ) / 9; + } + else output[(i)*(length)+(j)] = 1000; + } + else return; +} - unsigned int tid = threadIdx.x; - unsigned int warp = threadIdx.y / 2; +__global__ void Heat_sh(double *input, double *output, int len) +{ + extern __shared__ double s[]; + int length = len; + int S_size = (blockDim.x+2)*(blockDim.y+2); + + + + int I = blockIdx.y*blockDim.y; + int J = blockIdx.x*blockDim.x; + + int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block + + int a = 0; + int b = 0; + + //unrolled fill + + a = tid/(blockDim.x+2); + b = tid%(blockDim.x+2); + + if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + } + + a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2); + b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2); + + if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + } + + + __syncthreads(); + + a = threadIdx.y+1; //access s + b = threadIdx.x+1; //access s + + int i = (blockIdx.y * blockDim.y) + threadIdx.y; + int j = (blockIdx.x * blockDim.x) + threadIdx.x; + + + if((i > 0) && (j > 0) && (i < length-1) && (j < length-1)) + { + if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) + { + output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x+2) + b-1] + s[(a-1)*(blockDim.x+2) + b] + s[(a-1)*(blockDim.x+2) + b+1] + + s[(a)*(blockDim.x+2) + b-1] + s[(a)*(blockDim.x+2) + b] + s[(a)*(blockDim.x+2) + b+1] + + s[(a+1)*(blockDim.x+2) + b-1] + s[(a+1)*(blockDim.x+2) + b] + s[(a+1)*(blockDim.x+2) + b+1] ) / 9; + } + else output[(i)*(length)+(j)] = 1000; + } + else return; +} - unsigned int x = (blockIdx.x * blockDim.x) + threadIdx.x; // absolute x - unsigned int y = (blockIdx.y * (blockDim.y - 2)); // absolute y - - // follow the line setting - y += (warp != 0) ? threadIdx.y - 2 : (blockDim.y-1) * threadIdx.y; - - /* - Block structure: - - -1 ******************* <- warp 0 - 0 ddddddddddddddddddd <- warp 1 - 1 ddddddddddddddddddd <- warp 1 - 2 ddddddddddddddddddd <- warp 2 - 3 ddddddddddddddddddd <- warp 2 - .... - ddddddddddddddddddd <- warp ROW_THREADS/2 - 1 - ddddddddddddddddddd <- warp ROW_THREADS/2 - 1 - ******************* <- warp 0 - */ - - unsigned int TILE = gridDim.x * blockDim.x; - - unsigned int k = y * TILE + x; // absolute index of the array - - if (warp != 0) // && x > 0 && y > 0 && (x < length-1) && (y < length-1)) - output[k] = blockIdx.x; - - // grid boundary - //if ( !( (x > 0) && (y > 0) && (x < length-1) && (y < length-1) ) ) { - //sdata[threadIdx.y][threadIdx.x] = 0; - //} +__global__ void Heat_sh_mod(double *input, double *output, int len) +{ + extern __shared__ double s[]; + int length = len; + + + int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block + + int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x; //nb_blocks in x *blockDim.x + int In_y = blockIdx.y*blockDim.y - 2*blockIdx.y; //((nb_blocks in y)*blockDim.y) * nb_element per line + + if(((In_y-1 + (int)threadIdx.y)>=0)&&((In_x-1 + (int)threadIdx.x)>=0)&&(In_y-1 + (int)threadIdx.y 0) && (b > 0)&&(a 0)&&(j > 0)&&(i < length-1) && (j < length-1)) + { + if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) + { + output[(i)*(length)+(j)] = ( s[(a-1)*(blockDim.x) + b-1] + s[(a-1)*(blockDim.x) + b] + s[(a-1)*(blockDim.x) + b+1] + + s[(a)*(blockDim.x) + b-1] + s[(a)*(blockDim.x) + b] + s[(a)*(blockDim.x) + b+1] + + s[(a+1)*(blockDim.x) + b-1] + s[(a+1)*(blockDim.x) + b] + s[(a+1)*(blockDim.x) + b+1] ) / 9; + } + else output[(i)*(length)+(j)] = 1000; + } + else return; +} - // already sum the first three reductions - double sum = input[k]; - - // boundary terms are branching, O(N) - if (x > 0) - sum += input[k-1]; // non-coalesced access every right end block +__global__ void Heat_sh_red_1(double *input, double *output, int length) +{ + extern __shared__ double shared[]; + int S_size = (blockDim.x+2)*(blockDim.y+2); + int red_size = (blockDim.x)*(blockDim.y+2); + + double *s = shared; + double *red = (double*)&shared[S_size]; + + int I = blockIdx.y*blockDim.y; + int J = blockIdx.x*blockDim.x; + + int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block + + int a = 0; + int b = 0; + + + for(int k = tid ; k= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + } + } + /* + //unrolled fill + + a = tid/(blockDim.x+2); + b = tid%(blockDim.x+2); + + if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + } + + a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2); + b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2); + + if((a*(blockDim.x+2) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)]; + }*/ + + + __syncthreads(); + + for(int k = tid ; k 0) && (j > 0) && (i < length-1) && (j < length-1)) + { + if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) + { + output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9; + } + else output[(i)*(length)+(j)] = 1000; + } + + + else return; +} - // boundary terms are branching, O(N) - if (x < length-1) - sum += input[k+1]; // non-coalesced access every left end block - sdata[threadIdx.y][threadIdx.x] = sum; - // assure all threads are synchronized - __syncthreads(); - - // allow only one warp - - if (warp == 0) { - - // select only 16 threads - // no bank conflicts only if COL_THREADS is 16 - if (threadIdx.y == 0) { - - // two rows more are fetched - for (unsigned int row = 1; row < (ROW_THREADS - 1); ++row) { - - // select the same bank - sdata[row][tid] += sdata[row-1][tid] + sdata[row+1][tid]; - } - } - } - - // wait all threads are finished - __syncthreads(); - /* - if (((x == length/2-1) || (x == length/2)) && ((y == length/2-1) || (y == length/2))) - output[k] = 1000; // special values, a negligible branching - else if (warp != 0) - output[k] = sdata[threadIdx.y][threadIdx.x] / 9; - */ +__global__ void Heat_sh_red_2(double *input, double *output, int length) +{ + extern __shared__ double red[]; + int red_size = (blockDim.x)*(blockDim.y+2); + + int I = blockIdx.y*blockDim.y; + int J = blockIdx.x*blockDim.x; + + int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block + + int a = 0; + int b = 0; + /* + a = tid/(blockDim.y+2); + b = tid%(blockDim.y+2); + + if((a*(blockDim.x) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; + } + + a = (blockDim.x*blockDim.y+tid)/(blockDim.y); + b = (blockDim.x*blockDim.y+tid)%(blockDim.y); + + if((a*(blockDim.x) + b= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; + }*/ + + + + for(int k = tid ; k= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block + { + red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; //reduction + transpose (a and b swaped) + } + } + + __syncthreads(); + + int i = (blockIdx.y * blockDim.y) + threadIdx.y; + int j = (blockIdx.x * blockDim.x) + threadIdx.x; + + a = threadIdx.y; + b = threadIdx.x; + + if((i > 0) && (j > 0) && (i < length-1) && (j < length-1)) + { + if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2)))) + { + output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9; + } + else output[(i)*(length)+(j)] = 1000; + } + + + else return; } // GPU Optimized function void GPU_array_process(double *input, double *output, int length, int iterations) { //Cuda events for calculating elapsed time cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end; cudaEventCreate(&cpy_H2D_start); cudaEventCreate(&cpy_H2D_end); cudaEventCreate(&cpy_D2H_start); cudaEventCreate(&cpy_D2H_end); cudaEventCreate(&comp_start); cudaEventCreate(&comp_end); - // defining constants - - unsigned int XBLOCKS = length / COL_THREADS; // ceil to a COL_THREADS multiple, x direction - unsigned int YBLOCKS = length / (ROW_THREADS - 2); - - if (length % COL_THREADS != 0) - ++XBLOCKS; - - if (length % (ROW_THREADS-2) != 0) - ++YBLOCKS; - - unsigned int TILE = XBLOCKS * COL_THREADS; - unsigned int SIZE = TILE*TILE*sizeof(double); - - cout << "X blocks = " << XBLOCKS << endl; - cout << "Y blocks = " << YBLOCKS << endl; - - cout << "X TILE = " << TILE << endl; - cout << "Y TILE = " << (YBLOCKS * (ROW_THREADS-2)) << endl; - /* Preprocessing goes here */ - + int SIZE = length*length*sizeof(double); + double* gpu_input; double* gpu_output; double* temp; //used to swith pointers between gpu_input and gpu_output cudaMalloc((void**)&gpu_input, SIZE); cudaMalloc((void**)&gpu_output, SIZE); + cudaEventRecord(cpy_H2D_start); /* Copying array from host to device goes here */ + cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice); + cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice); - for (unsigned int row = 0; row < length; ++row) { - - // copy the row - cudaMemcpy((void*)(gpu_input + row * TILE), (void*)(input + row * length), length, cudaMemcpyHostToDevice); - } - dim3 thrsPerBlock(COL_THREADS, ROW_THREADS); - dim3 nBlks(XBLOCKS, YBLOCKS); //if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc... - //dim3 nBlks(1,1); + int row_threads = 16; + int col_threads = 16; + + //dim3 thrsPerBlock(row_threads, col_threads); + dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1); //if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc... + //dim3 nBlks((length-1)/(row_threads-2) + 1,(length-1)/(col_threads-2) + 1); + dim3 thrsPerBlock(row_threads, col_threads); cudaEventRecord(cpy_H2D_end); cudaEventSynchronize(cpy_H2D_end); //Copy array from host to device cudaEventRecord(comp_start); /* GPU calculation goes here */ - - unsigned int shared_mem_size = COL_THREADS * ROW_THREADS * sizeof(double); - for(size_t i = 0; i>>(gpu_input, gpu_output, length); - - // wait until GPU has finished - int error = cudaDeviceSynchronize(); - - if (error != cudaSuccess) { - cout << "Cuda error: " << error << endl; - } - + //Heat_naif<<>>(gpu_input, gpu_output, length); + //Heat<<>>(gpu_input, gpu_output, length); + Heat_sh<<>>(gpu_input, gpu_output, length); + //Heat_sh_mod<<>>(gpu_input, gpu_output, length); + //Heat_sh_red_1<<>>(gpu_input, gpu_output, length); + //Heat_sh_red_2<<>>(gpu_input, gpu_output, length); + temp = gpu_input; gpu_input = gpu_output; gpu_output = temp; + } gpu_output = gpu_input; //unswap from the last iteration + cudaEventRecord(comp_end); cudaEventSynchronize(comp_end); - - /*double mean = 0; - for (int i = 0; i < length; ++i) { - for (int j = 0; j < length; ++j) { - mean += gpu_output[i * TILE + j]; - } - } - - cout << "Massimo: " << (mean/(length*length)) << endl;*/ cudaEventRecord(cpy_D2H_start); /* Copying array from device to host goes here */ - for (unsigned int row = 0; row < length; ++row) { - cudaMemcpy((void*)(output + length * row), (void*)(gpu_output + TILE * row), length, cudaMemcpyDeviceToHost); - } + cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost); cudaEventRecord(cpy_D2H_end); cudaEventSynchronize(cpy_D2H_end); /* Postprocessing goes here */ cudaFree(gpu_input); cudaFree(gpu_output); float time; cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end); cout<<"Host to Device MemCpy takes "<