diff --git a/A3/execute_order.sh b/A3/execute_numa_test.sh
similarity index 76%
copy from A3/execute_order.sh
copy to A3/execute_numa_test.sh
index fde5d3b..265f5f2 100644
--- a/A3/execute_order.sh
+++ b/A3/execute_numa_test.sh
@@ -1,53 +1,56 @@
 #!/bin/bash
 #SBATCH --chdir /scratch/jaugey
 #SBATCH --nodes 1
 #SBATCH --ntasks 1
 #SBATCH --cpus-per-task 28
 #SBATCH --mem 10G
 #SBATCH --partition serial
 #SBATCH --account cs307
 #SBATCH --reservation CS307-ex 
 
-NAME=order
+NAME=numa_test
 
 if [ -n "$1" ]; then
     NAME=$1
 fi
 
 echo STARTING AT `date`
 
 # local mode
 echo "Local allocation"
 numactl --localalloc ./$NAME
 
 echo ""
 
 # select node 0
 echo "Binding node 0"
-numactl --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
 
 echo ""
 
 # select node 1
 echo "Binding node 1"
 numactl --membind=1 ./$NAME
 
 echo ""
 
 # interleaved mode
 echo "Binding interleave node 0"
 numactl --interleave=0 ./$NAME
 
 echo ""
 
 # interleaved mode
 echo "Binding interleave node 1"
-numactl --interleave=1 ./$NAME
+numactl --interleave=0 ./$NAME
 
 echo ""
 
 # interleaved mode
 echo "Binding interleave all nodes"
 numactl --interleave=all ./$NAME
 
 echo FINISHED at `date`
diff --git a/A3/execute_order.sh b/A3/execute_order.sh
index fde5d3b..83ae7e4 100644
--- a/A3/execute_order.sh
+++ b/A3/execute_order.sh
@@ -1,53 +1,30 @@
 #!/bin/bash
 #SBATCH --chdir /scratch/jaugey
 #SBATCH --nodes 1
 #SBATCH --ntasks 1
 #SBATCH --cpus-per-task 28
 #SBATCH --mem 10G
 #SBATCH --partition serial
 #SBATCH --account cs307
 #SBATCH --reservation CS307-ex 
 
 NAME=order
 
 if [ -n "$1" ]; then
     NAME=$1
 fi
 
 echo STARTING AT `date`
 
 # local mode
-echo "Local allocation"
-numactl --localalloc ./$NAME
+echo 1 Node
+numactl -C 0,1,2 ./$NAME
 
 echo ""
 
-# select node 0
-echo "Binding node 0"
-numactl --membind=0 ./$NAME
+# separated node
+echo 2 Nodes
+numactl -C 0,1,27 ./$NAME
 
-echo ""
-
-# select node 1
-echo "Binding node 1"
-numactl --membind=1 ./$NAME
-
-echo ""
-
-# interleaved mode
-echo "Binding interleave node 0"
-numactl --interleave=0 ./$NAME
-
-echo ""
-
-# interleaved mode
-echo "Binding interleave node 1"
-numactl --interleave=1 ./$NAME
-
-echo ""
-
-# interleaved mode
-echo "Binding interleave all nodes"
-numactl --interleave=all ./$NAME
 
 echo FINISHED at `date`
diff --git a/A3/execute_order.sh b/A3/execute_order_test.sh
similarity index 78%
copy from A3/execute_order.sh
copy to A3/execute_order_test.sh
index fde5d3b..7257cbf 100644
--- a/A3/execute_order.sh
+++ b/A3/execute_order_test.sh
@@ -1,53 +1,57 @@
 #!/bin/bash
 #SBATCH --chdir /scratch/jaugey
 #SBATCH --nodes 1
 #SBATCH --ntasks 1
 #SBATCH --cpus-per-task 28
 #SBATCH --mem 10G
 #SBATCH --partition serial
 #SBATCH --account cs307
 #SBATCH --reservation CS307-ex 
 
 NAME=order
 
 if [ -n "$1" ]; then
     NAME=$1
 fi
 
 echo STARTING AT `date`
 
 # local mode
 echo "Local allocation"
 numactl --localalloc ./$NAME
 
 echo ""
 
 # select node 0
 echo "Binding node 0"
-numactl --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
+numactl --cpunodebind=1 --membind=0 ./$NAME
 
 echo ""
 
 # select node 1
 echo "Binding node 1"
 numactl --membind=1 ./$NAME
 
 echo ""
 
 # interleaved mode
 echo "Binding interleave node 0"
 numactl --interleave=0 ./$NAME
 
 echo ""
 
 # interleaved mode
 echo "Binding interleave node 1"
 numactl --interleave=1 ./$NAME
 
 echo ""
 
 # interleaved mode
 echo "Binding interleave all nodes"
 numactl --interleave=all ./$NAME
 
 echo FINISHED at `date`
diff --git a/A3/numa.c b/A3/numa.c
index 8ec6d22..9772b70 100644
--- a/A3/numa.c
+++ b/A3/numa.c
@@ -1,135 +1,135 @@
 #include <omp.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "utility.h"
 #include <stdint.h>
 
 #define GB (uint64_t)(1 << 30)
 #define SIZE (uint64_t)(8*GB)
 
 volatile uint8_t *arr;
 
 #define CACHE_LINE 64
 
 // louis version accessor
 /*inline uint64_t next_addr(uint64_t i){
 
     // Change this part
     if(i%3==0)
     {
 		return 10 + 10*(arr[i]%10);
 	}
 	else
 	{
 		return 10 + 11*(arr[i]%11);
 	}
 }*/
 
 // louis version initializer
 /*inline void init_array(rand_gen gen){
      
     for(uint64_t i=0; i<SIZE; ++i){
         arr[i] = (uint64_t)(next_rand(gen) * 10);
     }
 }*/
 
 
 
 // ancarola version
 inline uint64_t next_addr(uint64_t i){
 	return arr[i];
 }
 
 // 0, 1 alternated version
 /*inline void init_array(rand_gen gen){
     // elude linear pattern recognizer
     for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){
 
         uint64_t place = i / (2 * CACHE_LINE);
 
         if (place % 2 == 0) {
             // first place
             arr[i] = 2 * CACHE_LINE + 1;
             //printf("Allocando il dio boia: (%d, %d)\n", (int)i, (int)arr[i]);
         } else {
             // second place
             arr[i+1] = 2 * CACHE_LINE - 1; 
             //printf("Allocando il dio boia: (%d, %d)\n", (int)(i+1), (int)arr[i+1]);
         }
     }
 }*/
 
 
 // linearly shifted version
 /*inline void init_array(rand_gen gen){
     // elude linear pattern recognizer
     uint64_t counter = 0;
 
     // shift 1 every two cache lines
     for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){
 
         uint64_t place = i / (2 * CACHE_LINE);
 
         if (place % 2 == 0) {
             arr[i + counter] = 2 * CACHE_LINE;
         } else {
 
             if ((counter + 1) % CACHE_LINE == 0) {
                 arr[i + counter] = CACHE_LINE + 1;
                 counter = 0;
             } else {
                 arr[i + counter] = 2 * CACHE_LINE + 1;
                 ++counter;
             }
         }
     }
 }*/
 
 
 // random version
 inline void init_array(rand_gen gen){
     
     uint64_t r[2];
     r[0] = 0;
     r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1));
 
-    for(uint64_t i=0; i<SIZE; i += 4 * CACHE_LINE){
+    for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){
 
-        arr[i + r[0]] = 4 * CACHE_LINE + (r[1] - r[0]);
+        arr[i + r[0]] = 2 * CACHE_LINE + (r[1] - r[0]);
         r[0] = r[1];
         r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1));
     }
 }
 
 int main(){
     uint64_t i, counter;
     double time;
     volatile uint8_t temp;
 
     arr = (uint8_t*) malloc(SIZE);
     rand_gen gen = init_rand();
 
     init_array(gen);
 
     //Start timer
     set_clock();
     
     for(i=0, counter=0; i<SIZE; counter++){
         temp = arr[i];
         i += next_addr(i);
     }
 
     //Stop timer
     time = elapsed_time();
 
     temp = next_rand(gen)*10;
     i = temp; // Just to suppress the compiler warning for not using temp and gen
 
     if(counter < 10000000)
         printf("ERROR: Too few accesses. You have to access more elements to measure reasonable time difference.\n");
     if((time/counter) < 1.0e-8)
         printf("ERROR: Time per access is too small. You have to further deoptimize the program to measure reasonable time difference.\n");
     printf("Traversing %lx GB array took total time = %.4g seconds, number of accesses = %lu, %.4g seconds per access\n", SIZE/GB, time, counter, time/counter);
 
     return 0;    
 }
diff --git a/A3/numa_test b/A3/numa_test
new file mode 100755
index 0000000..2641606
Binary files /dev/null and b/A3/numa_test differ
diff --git a/A3/numa.c b/A3/numa_test.c
similarity index 95%
copy from A3/numa.c
copy to A3/numa_test.c
index 8ec6d22..c5fc278 100644
--- a/A3/numa.c
+++ b/A3/numa_test.c
@@ -1,135 +1,150 @@
 #include <omp.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "utility.h"
 #include <stdint.h>
 
 #define GB (uint64_t)(1 << 30)
 #define SIZE (uint64_t)(8*GB)
 
 volatile uint8_t *arr;
 
 #define CACHE_LINE 64
 
 // louis version accessor
 /*inline uint64_t next_addr(uint64_t i){
 
     // Change this part
     if(i%3==0)
     {
 		return 10 + 10*(arr[i]%10);
 	}
 	else
 	{
 		return 10 + 11*(arr[i]%11);
 	}
 }*/
 
 // louis version initializer
 /*inline void init_array(rand_gen gen){
      
     for(uint64_t i=0; i<SIZE; ++i){
         arr[i] = (uint64_t)(next_rand(gen) * 10);
     }
 }*/
 
 
 
 // ancarola version
-inline uint64_t next_addr(uint64_t i){
+/*inline uint64_t next_addr(uint64_t i){
 	return arr[i];
+}*/
+
+inline uint64_t next_addr(uint64_t i){
+	return 1;
+}
+
+inline void init_array(rand_gen gen){
+	for(uint64_t i=0; i<SIZE; i++){
+		arr[i] = 1;
+	}
 }
 
+
+
 // 0, 1 alternated version
 /*inline void init_array(rand_gen gen){
     // elude linear pattern recognizer
     for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){
 
         uint64_t place = i / (2 * CACHE_LINE);
 
         if (place % 2 == 0) {
             // first place
             arr[i] = 2 * CACHE_LINE + 1;
             //printf("Allocando il dio boia: (%d, %d)\n", (int)i, (int)arr[i]);
         } else {
             // second place
             arr[i+1] = 2 * CACHE_LINE - 1; 
             //printf("Allocando il dio boia: (%d, %d)\n", (int)(i+1), (int)arr[i+1]);
         }
     }
 }*/
 
 
 // linearly shifted version
 /*inline void init_array(rand_gen gen){
     // elude linear pattern recognizer
     uint64_t counter = 0;
 
     // shift 1 every two cache lines
     for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){
 
         uint64_t place = i / (2 * CACHE_LINE);
 
         if (place % 2 == 0) {
             arr[i + counter] = 2 * CACHE_LINE;
         } else {
 
             if ((counter + 1) % CACHE_LINE == 0) {
                 arr[i + counter] = CACHE_LINE + 1;
                 counter = 0;
             } else {
                 arr[i + counter] = 2 * CACHE_LINE + 1;
                 ++counter;
             }
         }
     }
 }*/
 
 
 // random version
-inline void init_array(rand_gen gen){
+/*inline void init_array(rand_gen gen){
     
     uint64_t r[2];
     r[0] = 0;
     r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1));
 
     for(uint64_t i=0; i<SIZE; i += 4 * CACHE_LINE){
 
         arr[i + r[0]] = 4 * CACHE_LINE + (r[1] - r[0]);
         r[0] = r[1];
         r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1));
     }
-}
+}*/
+
+
+
 
 int main(){
     uint64_t i, counter;
     double time;
     volatile uint8_t temp;
 
     arr = (uint8_t*) malloc(SIZE);
     rand_gen gen = init_rand();
 
     init_array(gen);
 
     //Start timer
     set_clock();
     
     for(i=0, counter=0; i<SIZE; counter++){
         temp = arr[i];
         i += next_addr(i);
     }
 
     //Stop timer
     time = elapsed_time();
 
     temp = next_rand(gen)*10;
     i = temp; // Just to suppress the compiler warning for not using temp and gen
 
     if(counter < 10000000)
         printf("ERROR: Too few accesses. You have to access more elements to measure reasonable time difference.\n");
     if((time/counter) < 1.0e-8)
         printf("ERROR: Time per access is too small. You have to further deoptimize the program to measure reasonable time difference.\n");
     printf("Traversing %lx GB array took total time = %.4g seconds, number of accesses = %lu, %.4g seconds per access\n", SIZE/GB, time, counter, time/counter);
 
     return 0;    
 }
diff --git a/A3/order_fence.c.save b/A3/order_fence.c.save
new file mode 100644
index 0000000..e69de29
diff --git a/A3/order_fence.c.save.1 b/A3/order_fence.c.save.1
new file mode 100644
index 0000000..fafbc01
--- /dev/null
+++ b/A3/order_fence.c.save.1
@@ -0,0 +1,80 @@
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+volatile int X, Y, r1, r2;
+volatile bool t1block, t2block, t1fin, t2fin;
+
+void *thread1Func(void *param){
+    for(;;){
+        while(t1block); // Wait for blocker to be removed
+        t1block = true; // Set up blocker
+
+        X = 1;
+        asm volatile("mfence" ::: "memory"); // Prevent any compiler reordering
+        r1 = Y;
+
+        t1fin = true; // Signal iteration end
+    }
+    return NULL; // Never returns
+}
+
+void *thread2Func(void *param){
+    for(;;){
+        while(t2block); // Wait for blocker to be removed
+        t2block = true; // Set up blocker
+
+        Y = 1;
+        asm volatile("mfence" ::: "memory"); // Prevent any compiler reordering
+        r2 = X;
+
+        t2fin = true; // Signal iteration end
+    }
+    return NULL; // Never returns
+}
+
+int main(){
+    //Init
+    t1block = t2block = true; // Act as blockers for threads
+    t1fin = t2fin = false; // Act as finish signals for threads
+
+    // Spawn the threads
+    pthread_t thread0, thread1, thread2;
+    thread0 = pthread_self();
+    pthread_create(&thread1, NULL, thread1Func, NULL);
+    pthread_create(&thread2, NULL, thread2Func, NULL);
+
+    /* Set the main thread affinity to CPU 0 */
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(0, &cpuset);
+    int s = pthread_setaffinity_np(thread0, sizeof(cpu_set_t), &cpuset);
+    if (s != 0){
+        printf("Failed to set main thread affinity\n");
+        return 0;
+    }
+           
+    // Repeat the experiment
+    int detected = 0, i = 1;
+    for (i = 1; i < 1000000; i++){
+        // Reset all variables
+        X = 0;
+        Y = 0;
+        t1fin = t2fin = false;
+        t1block = t2block = false;
+
+        // Wait for threads to complete their iteration
+        while(!(t1fin && t2fin));
+
+        // Check if there was a memory reordering
+        if (r1 == 0 && r2 == 0){
+            detected++;
+        }
+    }
+    printf("%d memory re-orderings detected in %d iterations - %d percent\n", detected, i, 100*detected/i);
+    return 0;  // Never returns
+}