/*
============================================================================
Filename    : implementation.cu
Author      : Your name goes here
SCIPER      : Your SCIPER number
============================================================================
*/
#include <iostream>
#include <iomanip>
#include <sys/time.h>
#include <cuda_runtime.h>
using namespace std;
#define SHARED_MEM_SIZE 98304 // 96 KB
#define CACHE_LINE 128
#define WARP_SIZE 32
#define COL_THREADS 16
#define ROW_THREADS 8
// CPU Baseline
void array_process(double *input, double *output, int length, int iterations)
{
    double *temp;

    for (int n = 0; n < iterations; n++)
    {
        for (int i = 1; i < length - 1; i++)
        {
            for (int j = 1; j < length - 1; j++)
            {
                output[(i)*(length)+(j)] = (input[(i-1)*(length)+(j-1)] +
                                            input[(i-1)*(length)+(j)]   +
                                            input[(i-1)*(length)+(j+1)] +
                                            input[(i)*(length)+(j-1)]   +
                                            input[(i)*(length)+(j)]     +
                                            input[(i)*(length)+(j+1)]   +
                                            input[(i+1)*(length)+(j-1)] +
                                            input[(i+1)*(length)+(j)]   +
                                            input[(i+1)*(length)+(j+1)] ) / 9;
            }
        }

        // keep the four centre cells fixed at the heat-source temperature
        output[(length/2-1)*length+(length/2-1)] = 1000;
        output[(length/2)*length+(length/2-1)]   = 1000;
        output[(length/2-1)*length+(length/2)]   = 1000;
        output[(length/2)*length+(length/2)]     = 1000;

        // swap the roles of input and output for the next iteration
        temp   = input;
        input  = output;
        output = temp;
    }
}
/*
 * General idea:
 * - Allocate double-precision shared memory for each block.
 * - Use 16 threads per block row: we want full occupancy, and each double
 *   occupies 2 shared-memory banks.
 * - With a 128 B shared-memory row, a block row should not exceed
 *   128 B / sizeof(double) = 16 values.
 *
 * Idea for the column-reduction pass:
 * - Start a new kernel.
 * - To avoid conflicts, sum over columns: a factor-3 reduction of the
 *   original matrix.
 * - Store the transposed result into output.
 * - Swap input and output.
 *
 * Idea for the pass after the column reduction:
 * - Start a new kernel.
 * - To avoid conflicts, sum over rows with a padding of 1.
 * - Store the result into output.
 */
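// A small compile-time sanity check, added here only as a sketch (it is not part
// of the original submission and assumes the file is built as C++11 or newer):
// the per-block tile of ROW_THREADS x COL_THREADS doubles (8 x 16 x 8 B = 1 KB)
// must fit into the SHARED_MEM_SIZE (96 KB) assumed above.
static_assert(ROW_THREADS * COL_THREADS * sizeof(double) <= SHARED_MEM_SIZE,
              "per-block shared-memory tile exceeds SHARED_MEM_SIZE");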
__global__ void Heat(double *input, double *output, int length)
{
    // shared-memory tile for this block
    __shared__ double sdata[ROW_THREADS][COL_THREADS];

    unsigned int tid  = threadIdx.x;
    unsigned int warp = threadIdx.y / 2;
    unsigned int x    = (blockIdx.x * blockDim.x) + threadIdx.x;  // absolute x
    unsigned int y    = (blockIdx.y * (blockDim.y - 2));          // absolute y (block base row)

    // map threadIdx.y onto the row layout below: warp 0 handles the two halo
    // rows, the remaining warps handle the interior rows
    y += (warp != 0) ? threadIdx.y - 2 : (blockDim.y - 1) * threadIdx.y;

    /*
       Block structure:
       -1  *******************  <- warp 0
        0  ddddddddddddddddddd  <- warp 1
        1  ddddddddddddddddddd  <- warp 1
        2  ddddddddddddddddddd  <- warp 2
        3  ddddddddddddddddddd  <- warp 2
            ....
            ddddddddddddddddddd <- warp ROW_THREADS/2 - 1
            ddddddddddddddddddd <- warp ROW_THREADS/2 - 1
            ******************* <- warp 0
    */

    unsigned int TILE = gridDim.x * blockDim.x;
    unsigned int k    = y * TILE + x;  // absolute index into the padded array

    // debug write: colour every interior point with its block index
    if (warp != 0 && x > 0 && y > 0 && (x < length-1) && (y < length-1))
        output[k] = blockIdx.x;

    if ( !( (x > 0) && (y > 0) && (x < length-1) && (y < length-1) ) ) {
        // grid boundary: contribute zero to the reduction
        sdata[threadIdx.y][threadIdx.x] = 0;
    } else if (((x == length/2-1) || (x == length/2)) && ((y == length/2-1) || (y == length/2))) {
        // the four centre cells stay fixed at the heat-source temperature
        output[k] = 1000;
    } else {
        // all threads in a warp should share the same cache line;
        // already sum the first three (horizontal) neighbours
        sdata[threadIdx.y][threadIdx.x] =
            input[k-1] +   // non-coalesced access at the right edge of each block
            input[k]   +
            input[k+1];    // non-coalesced access at the left edge of each block

        // make sure the whole tile has been loaded
        __syncthreads();

        // allow only one warp
        if (warp == 0) {
            // select only 16 threads;
            // no bank conflicts only if COL_THREADS is 16
            if (threadIdx.y == 0) {
                // two extra (halo) rows were fetched
                for (unsigned int row = 1; row < (ROW_THREADS - 1); ++row) {
                    // each thread stays on the same bank
                    sdata[row][tid] += sdata[row-1][tid] + sdata[row+1][tid];
                }
            }
        }

        // wait until all threads have finished
        __syncthreads();

        // final store, currently commented out:
        //if (warp != 0)
        //    output[k] = 1.0;
        //    output[k] = sdata[threadIdx.y][threadIdx.x] / 9;
    }
}
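// A minimal, unoptimized reference kernel for the same 3x3 averaging step,
// added only as a validation sketch (it is not part of the original submission
// and is never launched below). It assumes the plain, unpadded length x length
// layout used by the CPU baseline, with one thread per grid point.
__global__ void Heat_reference(const double *input, double *output, int length)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // skip the outer boundary (and threads outside the grid)
    if (x <= 0 || y <= 0 || x >= length - 1 || y >= length - 1)
        return;

    // the four centre cells stay fixed at the heat-source temperature
    if ((x == length/2 - 1 || x == length/2) && (y == length/2 - 1 || y == length/2)) {
        output[y * length + x] = 1000;
        return;
    }

    // average of the 3x3 neighbourhood, as in array_process above
    double sum = 0;
    for (int dy = -1; dy <= 1; ++dy)
        for (int dx = -1; dx <= 1; ++dx)
            sum += input[(y + dy) * length + (x + dx)];
    output[y * length + x] = sum / 9;
}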
// GPU Optimized function
void GPU_array_process(double *input, double *output, int length, int iterations)
{
    // Cuda events for calculating elapsed time
    cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
    cudaEventCreate(&cpy_H2D_start);
    cudaEventCreate(&cpy_H2D_end);
    cudaEventCreate(&cpy_D2H_start);
    cudaEventCreate(&cpy_D2H_end);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_end);

    // defining constants
    unsigned int XBLOCKS = length / COL_THREADS;        // number of blocks in the x direction
    unsigned int YBLOCKS = length / (ROW_THREADS - 2);  // each block covers ROW_THREADS-2 interior rows
    if (length % COL_THREADS != 0)        // ceil to a COL_THREADS multiple
        ++XBLOCKS;
    if (length % (ROW_THREADS - 2) != 0)
        ++YBLOCKS;
    unsigned int TILE = XBLOCKS * COL_THREADS;          // padded row length on the device
    unsigned int SIZE = TILE * TILE * sizeof(double);

    cout << "X blocks = " << XBLOCKS << endl;
    cout << "Y blocks = " << YBLOCKS << endl;
    cout << "X TILE   = " << TILE << endl;
    cout << "Y TILE   = " << (YBLOCKS * (ROW_THREADS - 2)) << endl;
    /* Preprocessing goes here */
    double* gpu_input;
    double* gpu_output;
    double* temp;   // used to switch pointers between gpu_input and gpu_output
    cudaMalloc((void**)&gpu_input, SIZE);
    cudaMalloc((void**)&gpu_output, SIZE);

    cudaEventRecord(cpy_H2D_start);
    /* Copying array from host to device goes here */
    for (unsigned int row = 0; row < length; ++row) {
        // copy one row of length doubles into the padded device array
        cudaMemcpy((void*)(gpu_input + row * TILE), (void*)(input + row * length),
                   length * sizeof(double), cudaMemcpyHostToDevice);
    }
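    // (Suggested alternative, not in the original submission) the row-by-row loop
    // above can be expressed as a single strided transfer with cudaMemcpy2D; the
    // same call, with the pitches swapped and cudaMemcpyDeviceToHost, also covers
    // the copy back at the end:
    //
    //   cudaMemcpy2D(gpu_input, TILE * sizeof(double),     // dst, dst pitch (bytes)
    //                input,     length * sizeof(double),   // src, src pitch (bytes)
    //                length * sizeof(double), length,      // width (bytes), height (rows)
    //                cudaMemcpyHostToDevice);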
    dim3 thrsPerBlock(COL_THREADS, ROW_THREADS);
    dim3 nBlks(XBLOCKS, YBLOCKS);  // e.g. length 1..16 -> 1 block per direction, 17..32 -> 2 blocks, ...
    //dim3 nBlks(1,1);             // debug: single block

    cudaEventRecord(cpy_H2D_end);
    cudaEventSynchronize(cpy_H2D_end);

    cudaEventRecord(comp_start);
    /* GPU calculation goes here */
    unsigned int shared_mem_size = COL_THREADS * ROW_THREADS * sizeof(double);
    for (int i = 0; i < iterations; i++)
    {
        // launch the kernel and compute one evolution step
        Heat<<<nBlks, thrsPerBlock, shared_mem_size>>>(gpu_input, gpu_output, length);

        // wait until the GPU has finished
        cudaDeviceSynchronize();

        // swap the buffers for the next iteration
        temp       = gpu_input;
        gpu_input  = gpu_output;
        gpu_output = temp;
    }
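    // (Suggested addition, not in the original submission) a basic launch check
    // inside the loop above would catch configuration errors that are otherwise
    // silent, e.g. right after each launch:
    //
    //   cudaError_t err = cudaGetLastError();
    //   if (err != cudaSuccess)
    //       cout << "Kernel launch failed: " << cudaGetErrorString(err) << endl;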
    // undo the last swap so that gpu_output points at the freshest result
    // (a plain assignment would leak one buffer and free the other twice below)
    temp       = gpu_input;
    gpu_input  = gpu_output;
    gpu_output = temp;

    cudaEventRecord(comp_end);
    cudaEventSynchronize(comp_end);
    /* disabled debug check: mean of the result
       (gpu_output is device memory, so this would need a copy to the host first)
    double mean = 0;
    for (int i = 0; i < length; ++i) {
        for (int j = 0; j < length; ++j) {
            mean += gpu_output[i * TILE + j];
        }
    }
    cout << "Mean: " << (mean/(length*length)) << endl;*/
    cudaEventRecord(cpy_D2H_start);
    /* Copying array from device to host goes here */
    for (unsigned int row = 0; row < length; ++row) {
        // copy one row of length doubles back from the padded device array
        cudaMemcpy((void*)(output + length * row), (void*)(gpu_output + TILE * row),
                   length * sizeof(double), cudaMemcpyDeviceToHost);
    }
    cudaEventRecord(cpy_D2H_end);
    cudaEventSynchronize(cpy_D2H_end);
    /* Postprocessing goes here */
    cudaFree(gpu_input);
    cudaFree(gpu_output);

    float time;
    // cudaEventElapsedTime reports milliseconds, hence the division by 1000
    cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
    cout << "Host to Device MemCpy takes " << setprecision(4) << time/1000 << "s" << endl;

    cudaEventElapsedTime(&time, comp_start, comp_end);
    cout << "Computation takes " << setprecision(4) << time/1000 << "s" << endl;

    cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
    cout << "Device to Host MemCpy takes " << setprecision(4) << time/1000 << "s" << endl;
}