diff --git a/A4/assignment4 b/A4/assignment4
index 94c0b72..3d1b66b 100755
Binary files a/A4/assignment4 and b/A4/assignment4 differ
diff --git a/A4/implementation.cu b/A4/implementation.cu
index 6b249b1..8e0f343 100644
--- a/A4/implementation.cu
+++ b/A4/implementation.cu
@@ -1,279 +1,429 @@
 /*
 ============================================================================
 Filename    : algorithm.c
 Author      : Your name goes here
 SCIPER      : Your SCIPER number
 ============================================================================
 */
 
 #include <iostream>
 #include <iomanip>
 #include <sys/time.h>
 #include <cuda_runtime.h>
 using namespace std;
 
-#define SHARED_MEM_SIZE 98304 // 96 KB
-#define CACHE_LINE 128
-#define WARP_SIZE 32
-
-#define COL_THREADS 16
-#define ROW_THREADS 8
-
 // CPU Baseline
 void array_process(double *input, double *output, int length, int iterations)
 {
     double *temp;
 
     for(int n=0; n<(int) iterations; n++)
     {
         for(int i=1; i<length-1; i++)
         {
             for(int j=1; j<length-1; j++)
             {
                 output[(i)*(length)+(j)] = (input[(i-1)*(length)+(j-1)] +
                                             input[(i-1)*(length)+(j)]   +
                                             input[(i-1)*(length)+(j+1)] +
                                             input[(i)*(length)+(j-1)]   +
                                             input[(i)*(length)+(j)]     +
                                             input[(i)*(length)+(j+1)]   +
                                             input[(i+1)*(length)+(j-1)] +
                                             input[(i+1)*(length)+(j)]   +
                                             input[(i+1)*(length)+(j+1)] ) / 9;
-
             }
         }
         output[(length/2-1)*length+(length/2-1)] = 1000;
         output[(length/2)*length+(length/2-1)]   = 1000;
         output[(length/2-1)*length+(length/2)]   = 1000;
         output[(length/2)*length+(length/2)]     = 1000;
 
         temp = input;
         input = output;
         output = temp;
     }
 }
 
-/*
-* General idea:
-*   - Allocate double precision shared memory for each block
-*   - 16 threads per block, because want full occupancy and each thread takes 2 banks
-*   - sizeof(shared) = 128b, then a block should not exceed sizeof(shared) / sizeof(double) = 16 values
-*
-* Idea before column reduction:
-*   - Start a new kernel
-*   - Avoid conflicts, sum over columns: reduct of 3 times the original matrix
-*   - Store transposed result into output
-*   - Swap input and output
-*
-* Idea after column reduction:
-*   - Start a new kernel
-*   - Avoid conflicts, sum over rows with padding of 1
-*   - Store result into output
-*/
+__global__ void Heat_naif(double *input, double *output, int length)
+{
+	int j = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+	
+	
+	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
+	{
+		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
+		{
+			output[(i)*(length)+(j)] = (	input[(i-1)*(length)+(j-1)] +
+							input[(i-1)*(length)+(j)]   +
+							input[(i-1)*(length)+(j+1)] +
+							input[(i)*(length)+(j-1)]   +
+							input[(i)*(length)+(j)]     +
+							input[(i)*(length)+(j+1)]   +
+							input[(i+1)*(length)+(j-1)] +
+							input[(i+1)*(length)+(j)]   +
+							input[(i+1)*(length)+(j+1)] ) / 9;
+		}
+		else output[(i)*(length)+(j)] = 1000;
+    	}
+    	else return;
+}
+
 
 __global__ void Heat(double *input, double *output, int length)
 {
-    // shared memory portion
-    __shared__ double sdata[ROW_THREADS][COL_THREADS];
+	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
+	
+	
+	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
+	{
+		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
+		{
+			output[(i)*(length)+(j)] = (	input[(i-1)*(length)+(j-1)] +
+							input[(i-1)*(length)+(j)]   +
+							input[(i-1)*(length)+(j+1)] +
+							input[(i)*(length)+(j-1)]   +
+							input[(i)*(length)+(j)]     +
+							input[(i)*(length)+(j+1)]   +
+							input[(i+1)*(length)+(j-1)] +
+							input[(i+1)*(length)+(j)]   +
+							input[(i+1)*(length)+(j+1)] ) / 9;
+		}
+		else output[(i)*(length)+(j)] = 1000;
+    	}
+    	else return;
+}
 
-    unsigned int tid = threadIdx.x;
 
-    unsigned int warp = threadIdx.y / 2;
+__global__ void Heat_sh(double *input, double *output, int len)
+{
+	extern __shared__ double s[];
+	int length = len;
+	int S_size = (blockDim.x+2)*(blockDim.y+2);
+	
+	
+	
+	int I = blockIdx.y*blockDim.y;
+	int J = blockIdx.x*blockDim.x;
+	
+	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
+	
+	int a = 0;
+	int b = 0;
+	
+	//unrolled fill
+	
+	a = tid/(blockDim.x+2);
+	b = tid%(blockDim.x+2);
+	
+	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+	}
+
+	a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2);
+	b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2);
+	
+	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+	}
+	
+	
+	__syncthreads();
+	
+	a = threadIdx.y+1;	//access s
+	b = threadIdx.x+1;	//access s
+
+	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
+	
+
+	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
+	{
+		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
+		{
+			output[(i)*(length)+(j)] = (	s[(a-1)*(blockDim.x+2) + b-1] 	+ 	s[(a-1)*(blockDim.x+2) + b] +	s[(a-1)*(blockDim.x+2) + b+1] 
+						+ 	s[(a)*(blockDim.x+2) + b-1] 	+ 	s[(a)*(blockDim.x+2) + b] + 	s[(a)*(blockDim.x+2) + b+1] 
+						+ 	s[(a+1)*(blockDim.x+2) + b-1] 	+ 	s[(a+1)*(blockDim.x+2) + b] + 	s[(a+1)*(blockDim.x+2) + b+1] ) / 9;
+		}
+		else output[(i)*(length)+(j)] = 1000;
+    }
+    else return;
+}
 
-	unsigned int x = (blockIdx.x * blockDim.x) + threadIdx.x; // absolute x
-	unsigned int y = (blockIdx.y * (blockDim.y - 2)); // absolute y
-    
-    // follow the line setting
-    y += (warp != 0) ? threadIdx.y - 2 : (blockDim.y-1) * threadIdx.y; 
-
-    /*
-        Block structure:
-
-     -1 ******************* <- warp 0
-      0 ddddddddddddddddddd <- warp 1
-      1 ddddddddddddddddddd <- warp 1
-      2 ddddddddddddddddddd <- warp 2
-      3 ddddddddddddddddddd <- warp 2
-        ....
-        ddddddddddddddddddd <- warp ROW_THREADS/2 - 1
-        ddddddddddddddddddd <- warp ROW_THREADS/2 - 1
-        ******************* <- warp 0
-    */
-
-    unsigned int TILE = gridDim.x * blockDim.x;
-
-    unsigned int k = y * TILE + x; // absolute index of the array
-   
-    if (warp != 0) // && x > 0 && y > 0 && (x < length-1) && (y < length-1))
-        output[k] = blockIdx.x;
-    
-    // grid boundary
-	//if ( !( (x > 0) && (y > 0) && (x < length-1) && (y < length-1) ) ) {
-    //sdata[threadIdx.y][threadIdx.x] = 0;
-    //} 
+__global__ void Heat_sh_mod(double *input, double *output, int len)
+{
+	extern __shared__ double s[];
+	int length = len;
+	
+	
+	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
+	
+	int In_x = blockIdx.x*blockDim.x - 2*blockIdx.x;		//nb_blocks in x *blockDim.x
+	int In_y = blockIdx.y*blockDim.y - 2*blockIdx.y;		//((nb_blocks in y)*blockDim.y) * nb_element per line
+	
+	if(((In_y-1 + (int)threadIdx.y)>=0)&&((In_x-1 + (int)threadIdx.x)>=0)&&(In_y-1 + (int)threadIdx.y<length)&&(In_x-1 + (int)threadIdx.x<length))
+	{
+		s[tid] = input[(In_y-1+threadIdx.y)*length + In_x-1+threadIdx.x];
+	}
+	
+	__syncthreads();
+	
+	
+	int a = threadIdx.y;	//access s
+	int b = threadIdx.x;	//access s
+	
+	int i = (In_y-1) + a;
+	int j = (In_x-1) + b;
+	
+	if((a > 0) && (b > 0)&&(a<blockDim.y-1)&&(b<blockDim.x-1)&&(i > 0)&&(j > 0)&&(i < length-1) && (j < length-1))
+	{
+		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
+		{
+			output[(i)*(length)+(j)] = (	s[(a-1)*(blockDim.x) + b-1] 	+ 	s[(a-1)*(blockDim.x) + b] +	s[(a-1)*(blockDim.x) + b+1] 
+						+ 	s[(a)*(blockDim.x) + b-1] 	+ 	s[(a)*(blockDim.x) + b] + 	s[(a)*(blockDim.x) + b+1] 
+						+ 	s[(a+1)*(blockDim.x) + b-1] 	+ 	s[(a+1)*(blockDim.x) + b] + 	s[(a+1)*(blockDim.x) + b+1] ) / 9;
+		}
+		else output[(i)*(length)+(j)] = 1000;
+    }
+    else return;
+}
 
-    // already sum the first three reductions
-    double sum = input[k];
-    
-    // boundary terms are branching, O(N)
-    if (x > 0)
-        sum += input[k-1]; // non-coalesced access every right end block
+__global__ void Heat_sh_red_1(double *input, double *output, int length)
+{
+	extern __shared__ double shared[];
+	int S_size = (blockDim.x+2)*(blockDim.y+2);
+	int red_size = (blockDim.x)*(blockDim.y+2);
+	
+	double *s = shared;
+	double *red = (double*)&shared[S_size];
+	
+	int I = blockIdx.y*blockDim.y;
+	int J = blockIdx.x*blockDim.x;
+	
+	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
+	
+	int a = 0;
+	int b = 0;
+
+
+	for(int k = tid ; k<S_size; k += blockDim.x*blockDim.y)
+	{
+		a = k/(blockDim.x+2);
+		b = k%(blockDim.x+2);
+		
+		if((I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+		{
+			s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+		}
+	}
+	/*
+	//unrolled fill
+	
+	a = tid/(blockDim.x+2);
+	b = tid%(blockDim.x+2);
+	
+	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+	}
+
+	a = (blockDim.x*blockDim.y+tid)/(blockDim.x+2);
+	b = (blockDim.x*blockDim.y+tid)%(blockDim.x+2);
+	
+	if((a*(blockDim.x+2) + b<S_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		s[a*(blockDim.x+2) + b] = input[(I-1+a)*length + (J-1+b)];
+	}*/
+	
+	
+	__syncthreads();
+	
+	for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
+	{
+		a = k/(blockDim.y+2);
+		b = k%(blockDim.y+2);
+		
+		red[a*(blockDim.y+2) + b] = s[b*(blockDim.x+2) + a] + s[b*(blockDim.x+2) + a+1] + s[b*(blockDim.x+2) + a+2];	//reduction + transpose (a and b swaped)		
+	}
+	
+	__syncthreads();
+	
+	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
+	
+	a = threadIdx.y;
+	b = threadIdx.x;
+	
+	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
+	{
+		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
+		{
+			output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
+		}
+		else output[(i)*(length)+(j)] = 1000;
+	}
+	
+
+    else return;
+}
 
-    // boundary terms are branching, O(N)
-    if (x < length-1)
-        sum += input[k+1]; // non-coalesced access every left end block
 
-    sdata[threadIdx.y][threadIdx.x] = sum;
 
-    // assure all threads are synchronized
-    __syncthreads();
-        
-    // allow only one warp
-    
-    if (warp == 0) {
-
-        // select only 16 threads
-        // no bank conflicts only if COL_THREADS is 16
-        if (threadIdx.y == 0) {
- 
-            // two rows more are fetched
-            for (unsigned int row = 1; row < (ROW_THREADS - 1); ++row) {
- 
-                // select the same bank
-                sdata[row][tid] += sdata[row-1][tid] + sdata[row+1][tid];
-            }
-        }
-    }
-    
-    // wait all threads are finished
-    __syncthreads();
-    /* 
-    if (((x == length/2-1) || (x == length/2)) && ((y == length/2-1) || (y == length/2)))
-        output[k] = 1000; // special values, a negligible branching
-    else if (warp != 0)
-        output[k] = sdata[threadIdx.y][threadIdx.x] / 9;
-    */
+__global__ void Heat_sh_red_2(double *input, double *output, int length)
+{
+	extern __shared__ double red[];
+	int red_size = (blockDim.x)*(blockDim.y+2);
+	
+	int I = blockIdx.y*blockDim.y;
+	int J = blockIdx.x*blockDim.x;
+	
+	int tid = (threadIdx.y)*blockDim.x +threadIdx.x;	//tid in a thread block
+	
+	int a = 0;
+	int b = 0;
+	/*
+	a = tid/(blockDim.y+2);
+	b = tid%(blockDim.y+2);
+	
+	if((a*(blockDim.x) + b<red_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];
+	}
+
+	a = (blockDim.x*blockDim.y+tid)/(blockDim.y);
+	b = (blockDim.x*blockDim.y+tid)%(blockDim.y);
+	
+	if((a*(blockDim.x) + b<red_size)&&(I-1+a >= 0) && (J-1+b >= 0) && (I-1+a < length) && (J-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+	{
+		red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];
+	}*/
+
+	
+	
+	for(int k = tid ; k<red_size; k += blockDim.x*blockDim.y)
+	{
+		a = k/(blockDim.y+2);
+		b = k%(blockDim.y+2);
+
+		if((J-1+a >= 0) && (I-1+b >= 0) && (J+1+a < length) && (I-1+b < length)) 		// "-1" because we consider a rectangle that begins 1 element earlier in i and j. Hence one row and one column before the block
+		{
+			red[a*(blockDim.y+2) + b] = input[(I-1+b)*length + (J-1+a)] + input[(I-1+b)*length + (J+a)] + input[(I-1+b)*length + (J+a+1)];	//reduction + transpose (a and b swaped)
+		}
+	}
+	
+	__syncthreads();
+	
+	int i = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int j = (blockIdx.x * blockDim.x) + threadIdx.x;
+	
+	a = threadIdx.y;
+	b = threadIdx.x;
+	
+	if((i > 0) && (j > 0) && (i < length-1) && (j < length-1))
+	{
+		if(!(((i == length/2-1)||(i == length/2))&&((j == length/2-1)||(j == length/2))))
+		{
+			output[(i)*(length)+(j)] = (red[(b)*(blockDim.y+2) + a] + red[(b)*(blockDim.y+2) + a+1] + red[(b)*(blockDim.y+2) + a+2]) / 9;
+		}
+		else output[(i)*(length)+(j)] = 1000;
+	}
+	
+
+    else return;
 }
 
 
 // GPU Optimized function
 void GPU_array_process(double *input, double *output, int length, int iterations)
 {
     //Cuda events for calculating elapsed time
     cudaEvent_t cpy_H2D_start, cpy_H2D_end, comp_start, comp_end, cpy_D2H_start, cpy_D2H_end;
     cudaEventCreate(&cpy_H2D_start);
     cudaEventCreate(&cpy_H2D_end);
     cudaEventCreate(&cpy_D2H_start);
     cudaEventCreate(&cpy_D2H_end);
     cudaEventCreate(&comp_start);
     cudaEventCreate(&comp_end);
 
-    // defining constants
-
-    unsigned int XBLOCKS = length / COL_THREADS; // ceil to a COL_THREADS multiple, x direction
-    unsigned int YBLOCKS = length / (ROW_THREADS - 2); 
-
-    if (length % COL_THREADS != 0)
-        ++XBLOCKS;
-
-    if (length % (ROW_THREADS-2) != 0)
-        ++YBLOCKS;
-
-    unsigned int TILE = XBLOCKS * COL_THREADS;
-    unsigned int SIZE = TILE*TILE*sizeof(double); 
-
-    cout << "X blocks = " << XBLOCKS << endl;
-    cout << "Y blocks = " << YBLOCKS << endl;
-
-    cout << "X TILE = " << TILE << endl;
-    cout << "Y TILE = " << (YBLOCKS * (ROW_THREADS-2)) << endl;
-    
     /* Preprocessing goes here */
-
+    int SIZE = length*length*sizeof(double);
+    
     double* gpu_input;
     double* gpu_output;
     double* temp;				//used to swith pointers between gpu_input and gpu_output
     cudaMalloc((void**)&gpu_input, SIZE);
     cudaMalloc((void**)&gpu_output, SIZE);
     
+    
     cudaEventRecord(cpy_H2D_start);
     /* Copying array from host to device goes here */
+    cudaMemcpy((void*)gpu_input, (void*)input, SIZE, cudaMemcpyHostToDevice);
+    cudaMemcpy((void*)gpu_output, (void*)output, SIZE, cudaMemcpyHostToDevice);
 
-    for (unsigned int row = 0; row < length; ++row) {
-        
-        // copy the row
-        cudaMemcpy((void*)(gpu_input + row * TILE), (void*)(input + row * length), length, cudaMemcpyHostToDevice);
-    }
 
-    dim3 thrsPerBlock(COL_THREADS, ROW_THREADS);
-    dim3 nBlks(XBLOCKS, YBLOCKS);	//if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc...
-    //dim3 nBlks(1,1);
+    int row_threads = 16;
+    int col_threads = 16;
+
+    //dim3 thrsPerBlock(row_threads, col_threads);
+    dim3 nBlks((length-1)/row_threads + 1,(length-1)/col_threads + 1);	//if length == 0,...,16 -> 1 block, if length == 17,...,32 -> 2 blocks etc...
+    //dim3 nBlks((length-1)/(row_threads-2) + 1,(length-1)/(col_threads-2) + 1);
+    dim3 thrsPerBlock(row_threads, col_threads);
     
     cudaEventRecord(cpy_H2D_end);
     cudaEventSynchronize(cpy_H2D_end);
 
     //Copy array from host to device
     cudaEventRecord(comp_start);
     /* GPU calculation goes here */
-
-    unsigned int shared_mem_size = COL_THREADS * ROW_THREADS * sizeof(double);
     
-    for(size_t i = 0; i<iterations; i++)
+    
+    for(size_t i = 0; i<(int) iterations; i++)
     {
-        // start kernel and compute evolution
-        // , SHARED_MEM_SIZE
-		Heat<<<nBlks, thrsPerBlock, shared_mem_size>>>(gpu_input, gpu_output, length);
-	    
-        // wait until GPU has finished    
-		int error = cudaDeviceSynchronize();
-        
-        if (error != cudaSuccess) {
-            cout << "Cuda error: " << error << endl;
-        }
-
+		//Heat_naif<<<nBlks, thrsPerBlock>>>(gpu_input, gpu_output, length);
+		//Heat<<<nBlks, thrsPerBlock>>>(gpu_input, gpu_output, length);
+		Heat_sh<<<nBlks, thrsPerBlock, ((row_threads+2)*(col_threads+2))*sizeof(double)>>>(gpu_input, gpu_output, length);
+		//Heat_sh_mod<<<nBlks, thrsPerBlock, (row_threads*col_threads)*sizeof(double)>>>(gpu_input, gpu_output, length);
+		//Heat_sh_red_1<<<nBlks, thrsPerBlock, (((row_threads+2)*(col_threads+2))+ ((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
+		//Heat_sh_red_2<<<nBlks, thrsPerBlock, (((row_threads)*(col_threads+2)))*sizeof(double)>>>(gpu_input, gpu_output, length);
+		
 		temp = gpu_input;
 		gpu_input = gpu_output;
 		gpu_output = temp;
+
     }
 
     gpu_output = gpu_input;	//unswap from the last iteration
     
+
     cudaEventRecord(comp_end);
     cudaEventSynchronize(comp_end);
-    
-    /*double mean = 0;
-    for (int i = 0; i < length; ++i) {
-        for (int j = 0; j < length; ++j) {
-            mean += gpu_output[i * TILE + j];
-        }
-    }
-
-    cout << "Massimo: " << (mean/(length*length)) << endl;*/
 
     cudaEventRecord(cpy_D2H_start);
     /* Copying array from device to host goes here */
     
-    for (unsigned int row = 0; row < length; ++row) {
-        cudaMemcpy((void*)(output + length * row), (void*)(gpu_output + TILE * row), length, cudaMemcpyDeviceToHost);
-    }
+    cudaMemcpy((void*)output, (void*)gpu_output, SIZE, cudaMemcpyDeviceToHost);
     
     
     cudaEventRecord(cpy_D2H_end);
     cudaEventSynchronize(cpy_D2H_end);
 
     /* Postprocessing goes here */
     
     cudaFree(gpu_input);
     cudaFree(gpu_output);
     
     
     float time;
     cudaEventElapsedTime(&time, cpy_H2D_start, cpy_H2D_end);
     cout<<"Host to Device MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
 
     cudaEventElapsedTime(&time, comp_start, comp_end);
     cout<<"Computation takes "<<setprecision(4)<<time/1000<<"s"<<endl;
 
     cudaEventElapsedTime(&time, cpy_D2H_start, cpy_D2H_end);
     cout<<"Device to Host MemCpy takes "<<setprecision(4)<<time/1000<<"s"<<endl;
 }
diff --git a/A4/implementation.cu b/A4/implementation_raffa.cu
similarity index 100%
copy from A4/implementation.cu
copy to A4/implementation_raffa.cu