diff --git a/A4/execute.sh b/A4/execute.sh
index 2b32fd1..9c77193 100644
--- a/A4/execute.sh
+++ b/A4/execute.sh
@@ -1,21 +1,21 @@
 #!/bin/bash
 #SBATCH --partition=gpu
 #SBATCH --qos=gpu_free
 #SBATCH --gres=gpu:1
 #SBATCH --nodes=1
 #SBATCH --time=1:0:0
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=1
 #SBATCH --mem 1G
 #SBATCH --account cs307
 #SBATCH --reservation CS307-gpu
 
-length=16
-iterations=1
+length=5000
+iterations=2500
 
 module load gcc cuda
 
 echo STARTING AT `date`
 
 ./assignment4 $length $iterations
 echo FINISHED at `date`
diff --git a/A4/implementation.cu b/A4/implementation.cu
index e2a059e..b2a2da6 100644
--- a/A4/implementation.cu
+++ b/A4/implementation.cu
@@ -1,429 +1,457 @@
 /*
 ============================================================================
 Filename    : algorithm.c
 Author      : Your name goes here
 SCIPER      : Your SCIPER number
 ============================================================================
 */
 
 #include <iostream>
 #include <iomanip>
 #include <sys/time.h>
 #include <cuda_runtime.h>
 using namespace std;
 
 // CPU Baseline
 void array_process(double *input, double *output, int length, int iterations)
 {
     double *temp;
 
     for(int n=0; n<(int) iterations; n++)
     {
         for(int i=1; i<length-1; i++)
         {
             for(int j=1; j<length-1; j++)
             {
                 output[(i)*(length)+(j)] = (input[(i-1)*(length)+(j-1)] +
                                             input[(i-1)*(length)+(j)]   +
                                             input[(i-1)*(length)+(j+1)] +
                                             input[(i)*(length)+(j-1)]   +
                                             input[(i)*(length)+(j)]     +
                                             input[(i)*(length)+(j+1)]   +
                                             input[(i+1)*(length)+(j-1)] +
                                             input[(i+1)*(length)+(j)]   +
                                             input[(i+1)*(length)+(j+1)] ) / 9;
             }
         }
         output[(length/2-1)*length+(length/2-1)] = 1000;
         output[(length/2)*length+(length/2-1)]   = 1000;
         output[(length/2-1)*length+(length/2)]   = 1000;
         output[(length/2)*length+(length/2)]     = 1000;
 
         temp = input;
         input = output;
         output = temp;
     }
 }
 
 
 __global__ void Heat(double *input, double *output, int length)
 {
 	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
 	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
 	
 	
 	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
 	{
 		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
 		{
 			output[(i)*(length)+(j)] = (	input[(i-1)*(length)+(j-1)] +
 							input[(i-1)*(length)+(j)]   +
 							input[(i-1)*(length)+(j+1)] +
 							input[(i)*(length)+(j-1)]   +
 							input[(i)*(length)+(j)]     +
 							input[(i)*(length)+(j+1)]   +
 							input[(i+1)*(length)+(j-1)] +
 							input[(i+1)*(length)+(j)]   +
 							input[(i+1)*(length)+(j+1)] ) / 9;
 		}
 		else output[(i)*(length)+(j)] = 1000;
     	}
     	else return;
 }
 
 
 __global__ void Heat_sh(double *input, double *output, int len)
 {
 	extern __shared__ double s[];
 	int length = len;
 	int S_size = (blockDim.x+2)*(blockDim.y+2);
 	
 	
 	
 	int I = blockIdx.y*blockDim.y;
 	int J = blockIdx.x*blockDim.x;
 	
 	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
 	
 	int a = 0;
 	int b = 0;
 	
 	/*
 	//first thread fills s
 	
 	if(tid == 0)
 	{
 		for(int k = 0 ; k<S_size; k++)
 		{
 			a = k/(blockDim.x+2);
 			b = k%(blockDim.x+2);
 			
 
 			if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 	// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 			{
 				s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
 			}
 		}
 	}
 
 	
 	//first warp fills s
 	
 	int warp_size = 32;
 	int warp_id = threadIdx.x / warp_size;
 	int ALU_id = threadIdx.x % warp_size;
 	
 	if(warp_id == 0)
 	{
 		for(int k = ALU_id ; k<S_size; k += warp_size)
 		{
 			a = k/(blockDim.x+2);
 			b = k%(blockDim.x+2);
 			
 			if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 	// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 			{
 				s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
 			}
 		}
 	}
 	
 	
 	//all threads fill s
 
 	for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
 	{
 		a = k/(blockDim.x+2);
 		b = k%(blockDim.x+2);
 		
 		if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 		{
 			s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
 		}
 	}
 	*/
 	
 	//unrolled fill
 	
 	a = tid/(blockDim.x+2);
 	b = tid%(blockDim.x+2);
 	
 	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 	{
 		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
 	}
 
 	a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2);
 	b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2);
 	
 	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 	{
 		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
 	}
 	
 	
 	__syncthreads();
 	
 	a = threadIdx.y+1;	//access s
 	b = threadIdx.x+1;	//access s
 
 	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
 	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
 	
 
 	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
 	{
 		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
 		{
 			output[(i)*(length)+(j)] = (	s[(a-1)*(blockDim.x+2) + b-1] 	+ 	s[(a-1)*(blockDim.x+2) + b] +	s[(a-1)*(blockDim.x+2) + b+1] 
 						+ 	s[(a)*(blockDim.x+2) + b-1] 	+ 	s[(a)*(blockDim.x+2) + b] + 	s[(a)*(blockDim.x+2) + b+1] 
 						+ 	s[(a+1)*(blockDim.x+2) + b-1] 	+ 	s[(a+1)*(blockDim.x+2) + b] + 	s[(a+1)*(blockDim.x+2) + b+1] ) / 9;
 		}
 		else output[(i)*(length)+(j)] = 1000;
     }
     else return;
 }
 
 __global__ void Heat_sh_mod(double *input, double *output, int len)
 {
 	extern __shared__ double s[];
 	int length = len;
 	
-	//int S_size_x = blockDim.x;
-	//int S_size_y = blockDim.y;
-	
-	//int size_x = (blockDim.x-2);
-	//int size_y = (blockDim.y-2);
-	
-	
-	
-	int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x;		//nb_blocks in x *blockDim.x
-	int In_y = ((blockIdx.y*gridDim.x)*blockDim.y) - 2*blockIdx.y;		//((nb_blocks in y)*blockDim.y) * nb_element per line
-	
 	
 	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
 	
+	int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x;		//nb_blocks in x *blockDim.x
+	int In_y = blockIdx.y*blockDim.y - 2*blockIdx.y;		//((nb_blocks in y)*blockDim.y) * nb_element per line
 	
-	s[tid] = input[(In_y-1)*(gridDim.x*blockDim.x) + In_x - 1 + tid];
+	if(((In_y-1 + (int)threadIdx.y)>=0)&&((In_x-1 + (int)threadIdx.x)>=0)&&(In_y-1 + (int)threadIdx.y<length)&&(In_x-1 + (int)threadIdx.x<length))
+	{
+		s[tid] = input[(In_y-1+threadIdx.y)*length + In_x-1+threadIdx.x];
+	}
 	
 	__syncthreads();
 	
 	
 	int a = threadIdx.y;	//access s
 	int b = threadIdx.x;	//access s
 	
-	int i = In_y*(gridDim.x*blockDim.x) + a-1;
-	int j = In_x + b-1;
+	int i = (In_y-1) + a;
+	int j = (In_x-1) + b;
 	
 	if((a > 0) && (b > 0)&&(a<blockDim.y-1)&&(b<blockDim.x-1)&&(i > 0)&&(j > 0)&&(i < length-1) && (j < length-1))
 	{
 		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
 		{
 			output[(i)*(length)+(j)] = (	s[(a-1)*(blockDim.x) + b-1] 	+ 	s[(a-1)*(blockDim.x) + b] +	s[(a-1)*(blockDim.x) + b+1] 
 						+ 	s[(a)*(blockDim.x) + b-1] 	+ 	s[(a)*(blockDim.x) + b] + 	s[(a)*(blockDim.x) + b+1] 
 						+ 	s[(a+1)*(blockDim.x) + b-1] 	+ 	s[(a+1)*(blockDim.x) + b] + 	s[(a+1)*(blockDim.x) + b+1] ) / 9;
 		}
 		else output[(i)*(length)+(j)] = 1000;
     }
     else return;
 }
 
 __global__ void Heat_sh_red_1(double *input, double *output, int length)
 {
 	extern __shared__ double shared[];
 	int S_size = (blockDim.x+2)*(blockDim.y+2);
 	int red_size = (blockDim.x)*(blockDim.y+2);
 	
 	double *s = shared;
 	double *red = (double*)&shared[S_size];
 	
 	int I = blockIdx.y*blockDim.y;
 	int J = blockIdx.x*blockDim.x;
 	
 	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
 	
 	int a = 0;
 	int b = 0;
-	
-	//all threads fill s
+
 
 	for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
 	{
 		a = k/(blockDim.x+2);
 		b = k%(blockDim.x+2);
 		
 		if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 		{
 			s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
 		}
 	}
+	/*
+	//unrolled fill
+	
+	a = tid/(blockDim.x+2);
+	b = tid%(blockDim.x+2);
+	
+	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+	}
+
+	a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2);
+	b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2);
+	
+	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+	}*/
+	
 	
 	__syncthreads();
 	
 	for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
 	{
 		a = k/(blockDim.y+2);
 		b = k%(blockDim.y+2);
 		
 		red[a*(blockDim.y+2) + b] = s[b*(blockDim.x+2) + a] + s[b*(blockDim.x+2) + a+1] + s[b*(blockDim.x+2) + a+2];	//reduction + transpose (a and b swaped)		
 	}
 	
 	__syncthreads();
 	
 	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
 	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
 	
 	a = threadIdx.y;
 	b = threadIdx.x;
 	
 	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
 	{
 		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
 		{
 			output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
 		}
 		else output[(i)*(length)+(j)] = 1000;
 	}
 	
 
     else return;
 }
 
 
 
 __global__ void Heat_sh_red_2(double *input, double *output, int length)
 {
 	extern __shared__ double red[];
 	int red_size = (blockDim.x)*(blockDim.y+2);
 	
 	int I = blockIdx.y*blockDim.y;
 	int J = blockIdx.x*blockDim.x;
 	
 	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
 	
 	int a = 0;
 	int b = 0;
+	/*
+	a = tid/(blockDim.y+2);
+	b = tid%(blockDim.y+2);
 	
+	if((a*(blockDim.x) + b<red_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];
+	}
+
+	a = (blockDim.x*blockDim.y+tid)/(blockDim.y);
+	b = (blockDim.x*blockDim.y+tid)%(blockDim.y);
+	
+	if((a*(blockDim.x) + b<red_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];
+	}*/
+
 	
 	
 	for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
 	{
 		a = k/(blockDim.y+2);
 		b = k%(blockDim.y+2);
 
 		if((J-1+a >= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
 		{
 			red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];	//reduction + transpose (a and b swaped)
 		}
 	}
 	
 	__syncthreads();
 	
 	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
 	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
 	
 	a = threadIdx.y;
 	b = threadIdx.x;
 	
 	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
 	{
 		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
 		{
 			output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
 		}
 		else output[(i)*(length)+(j)] = 1000;
 	}
 	
 
     else return;
 }
 
 
 // GPU Optimized function
 void GPU_array_process(double *input, double *output, int length, int iterations)
 {
     //Cuda events for calculating elapsed time
     cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
     cudaEventCreate(&cpy_H2D_start);
     cudaEventCreate(&cpy_H2D_end);
     cudaEventCreate(&cpy_D2H_start);
     cudaEventCreate(&cpy_D2H_end);
     cudaEventCreate(&comp_start);
     cudaEventCreate(&comp_end);
 
     /* Preprocessing goes here */
     int SIZE = length*length*sizeof(double);
     
     double* gpu_input;
     double* gpu_output;
     double* temp;				//used to swith pointers between gpu_input and gpu_output
     cudaMalloc((void**)&gpu_input, SIZE);
     cudaMalloc((void**)&gpu_output, SIZE);
     
     
     cudaEventRecord(cpy_H2D_start);
     /* Copying array from host to device goes here */
     cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice);
     cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice);
 
 
-    int row_threads = 16;
-    int col_threads = 16;
+    int row_threads = 4;
+    int col_threads = 4;
 
     //dim3 thrsPerBlock(row_threads, col_threads);
-    //dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1);	//if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc...
-    dim3 nBlks((length-1)/(row_threads-2) + 1,(length-1)/(col_threads-2) + 1);
+    dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1);	//if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc...
+    //dim3 nBlks((length-1)/(row_threads-2) + 1,(length-1)/(col_threads-2) + 1);
     dim3 thrsPerBlock(row_threads, col_threads);
     
     cudaEventRecord(cpy_H2D_end);
     cudaEventSynchronize(cpy_H2D_end);
 
     //Copy array from host to device
     cudaEventRecord(comp_start);
     /* GPU calculation goes here */
     
     
     for(size_t i = 0; i<(int) iterations; i++)
     {
 		//Heat<<<nBlks, thrsPerBlock>>>(gpu_input, gpu_output, length);
 		//Heat_sh<<<nBlks, thrsPerBlock, ((row_threads+2)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);
-		Heat_sh_mod<<<nBlks, thrsPerBlock, (row_threads*col_threads)*sizeof(double)>>>(gpu_input, gpu_output, length);
-		//Heat_sh_red_1<<<nBlks, thrsPerBlock, (((row_threads+2)*(col_threads+2))+ ((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
+		//Heat_sh_mod<<<nBlks, thrsPerBlock, (row_threads*col_threads)*sizeof(double)>>>(gpu_input, gpu_output, length);
+		Heat_sh_red_1<<<nBlks, thrsPerBlock, (((row_threads+2)*(col_threads+2))+ ((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
 		//Heat_sh_red_2<<<nBlks, thrsPerBlock, (((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
 		
 		temp = gpu_input;
 		gpu_input = gpu_output;
 		gpu_output = temp;
 
     }
 
     gpu_output = gpu_input;	//unswap from the last iteration
     
 
     cudaEventRecord(comp_end);
     cudaEventSynchronize(comp_end);
 
     cudaEventRecord(cpy_D2H_start);
     /* Copying array from device to host goes here */
     
     cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost);
     
     
     cudaEventRecord(cpy_D2H_end);
     cudaEventSynchronize(cpy_D2H_end);
 
     /* Postprocessing goes here */
     
     cudaFree(gpu_input);
     cudaFree(gpu_output);
     
     
     float time;
     cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
     cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
 
     cudaEventElapsedTime(&time, comp_start, comp_end);
     cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;
 
     cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
     cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
 }