__global__ void Heat_sh(double *input, double *output, int length)
{
extern __shared__ double s[];
int S_size = (blockDim.x+2)*(blockDim.y+2);
int I = blockIdx.y*blockDim.y;
int J = blockIdx.x*blockDim.x;
int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block
int a = 0;
int b = 0;
/*
//first thread fills s
if(tid == 0)
{
for(int k = 0 ; k<S_size; k++)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
__global__ void Heat_sh_red_1(double *input, double *output, int length)
{
extern __shared__ double shared[];
int S_size = (blockDim.x+2)*(blockDim.y+2);
int red_size = (blockDim.x)*(blockDim.y+2);
double *s = shared;
double *red = (double*)&shared[S_size];
int I = blockIdx.y*blockDim.y;
int J = blockIdx.x*blockDim.x;
int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block
int a = 0;
int b = 0;
//all threads fill s
for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.x+2);
b = k%(blockDim.x+2);
if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
__global__ void Heat_sh_red_2(double *input, double *output, int length)
{
extern __shared__ double red[];
int red_size = (blockDim.x)*(blockDim.y+2);
int I = blockIdx.y*blockDim.y;
int J = blockIdx.x*blockDim.x;
int tid = (threadIdx.y)*blockDim.x +threadIdx.x; //tid in a thread block
int a = 0;
int b = 0;
for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
{
a = k/(blockDim.y+2);
b = k%(blockDim.y+2);
if((J-1+a >= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) // "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
{
red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)]; //reduction + transpose (a and b swaped)