diff --git a/A3/execute_order.sh b/A3/execute_numa_test.sh similarity index 76% copy from A3/execute_order.sh copy to A3/execute_numa_test.sh index fde5d3b..265f5f2 100644 --- a/A3/execute_order.sh +++ b/A3/execute_numa_test.sh @@ -1,53 +1,56 @@ #!/bin/bash #SBATCH --chdir /scratch/jaugey #SBATCH --nodes 1 #SBATCH --ntasks 1 #SBATCH --cpus-per-task 28 #SBATCH --mem 10G #SBATCH --partition serial #SBATCH --account cs307 #SBATCH --reservation CS307-ex -NAME=order +NAME=numa_test if [ -n "$1" ]; then NAME=$1 fi echo STARTING AT `date` # local mode echo "Local allocation" numactl --localalloc ./$NAME echo "" # select node 0 echo "Binding node 0" -numactl --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME echo "" # select node 1 echo "Binding node 1" numactl --membind=1 ./$NAME echo "" # interleaved mode echo "Binding interleave node 0" numactl --interleave=0 ./$NAME echo "" # interleaved mode echo "Binding interleave node 1" -numactl --interleave=1 ./$NAME +numactl --interleave=0 ./$NAME echo "" # interleaved mode echo "Binding interleave all nodes" numactl --interleave=all ./$NAME echo FINISHED at `date` diff --git a/A3/execute_order.sh b/A3/execute_order.sh index fde5d3b..83ae7e4 100644 --- a/A3/execute_order.sh +++ b/A3/execute_order.sh @@ -1,53 +1,30 @@ #!/bin/bash #SBATCH --chdir /scratch/jaugey #SBATCH --nodes 1 #SBATCH --ntasks 1 #SBATCH --cpus-per-task 28 #SBATCH --mem 10G #SBATCH --partition serial #SBATCH --account cs307 #SBATCH --reservation CS307-ex NAME=order if [ -n "$1" ]; then NAME=$1 fi echo STARTING AT `date` # local mode -echo "Local allocation" -numactl --localalloc ./$NAME +echo 1 Node +numactl -C 0,1,2 ./$NAME echo "" -# select node 0 -echo "Binding node 0" -numactl --membind=0 ./$NAME +# separated node +echo 2 Nodes +numactl -C 0,1,27 ./$NAME -echo "" - -# select node 1 -echo "Binding node 1" -numactl --membind=1 ./$NAME - -echo "" - -# interleaved mode -echo "Binding interleave node 0" -numactl --interleave=0 ./$NAME - -echo "" - -# interleaved mode -echo "Binding interleave node 1" -numactl --interleave=1 ./$NAME - -echo "" - -# interleaved mode -echo "Binding interleave all nodes" -numactl --interleave=all ./$NAME echo FINISHED at `date` diff --git a/A3/execute_order.sh b/A3/execute_order_test.sh similarity index 78% copy from A3/execute_order.sh copy to A3/execute_order_test.sh index fde5d3b..7257cbf 100644 --- a/A3/execute_order.sh +++ b/A3/execute_order_test.sh @@ -1,53 +1,57 @@ #!/bin/bash #SBATCH --chdir /scratch/jaugey #SBATCH --nodes 1 #SBATCH --ntasks 1 #SBATCH --cpus-per-task 28 #SBATCH --mem 10G #SBATCH --partition serial #SBATCH --account cs307 #SBATCH --reservation CS307-ex NAME=order if [ -n "$1" ]; then NAME=$1 fi echo STARTING AT `date` # local mode echo "Local allocation" numactl --localalloc ./$NAME echo "" # select node 0 echo "Binding node 0" -numactl --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME +numactl --cpunodebind=1 --membind=0 ./$NAME echo "" # select node 1 echo "Binding node 1" numactl --membind=1 ./$NAME echo "" # interleaved mode echo "Binding interleave node 0" numactl --interleave=0 ./$NAME echo "" # interleaved mode echo "Binding interleave node 1" numactl --interleave=1 ./$NAME echo "" # interleaved mode echo "Binding interleave all nodes" numactl --interleave=all ./$NAME echo FINISHED at `date` diff --git a/A3/numa.c b/A3/numa.c index 8ec6d22..9772b70 100644 --- a/A3/numa.c +++ b/A3/numa.c @@ -1,135 +1,135 @@ #include <omp.h> #include <stdio.h> #include <stdlib.h> #include "utility.h" #include <stdint.h> #define GB (uint64_t)(1 << 30) #define SIZE (uint64_t)(8*GB) volatile uint8_t *arr; #define CACHE_LINE 64 // louis version accessor /*inline uint64_t next_addr(uint64_t i){ // Change this part if(i%3==0) { return 10 + 10*(arr[i]%10); } else { return 10 + 11*(arr[i]%11); } }*/ // louis version initializer /*inline void init_array(rand_gen gen){ for(uint64_t i=0; i<SIZE; ++i){ arr[i] = (uint64_t)(next_rand(gen) * 10); } }*/ // ancarola version inline uint64_t next_addr(uint64_t i){ return arr[i]; } // 0, 1 alternated version /*inline void init_array(rand_gen gen){ // elude linear pattern recognizer for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){ uint64_t place = i / (2 * CACHE_LINE); if (place % 2 == 0) { // first place arr[i] = 2 * CACHE_LINE + 1; //printf("Allocando il dio boia: (%d, %d)\n", (int)i, (int)arr[i]); } else { // second place arr[i+1] = 2 * CACHE_LINE - 1; //printf("Allocando il dio boia: (%d, %d)\n", (int)(i+1), (int)arr[i+1]); } } }*/ // linearly shifted version /*inline void init_array(rand_gen gen){ // elude linear pattern recognizer uint64_t counter = 0; // shift 1 every two cache lines for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){ uint64_t place = i / (2 * CACHE_LINE); if (place % 2 == 0) { arr[i + counter] = 2 * CACHE_LINE; } else { if ((counter + 1) % CACHE_LINE == 0) { arr[i + counter] = CACHE_LINE + 1; counter = 0; } else { arr[i + counter] = 2 * CACHE_LINE + 1; ++counter; } } } }*/ // random version inline void init_array(rand_gen gen){ uint64_t r[2]; r[0] = 0; r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1)); - for(uint64_t i=0; i<SIZE; i += 4 * CACHE_LINE){ + for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){ - arr[i + r[0]] = 4 * CACHE_LINE + (r[1] - r[0]); + arr[i + r[0]] = 2 * CACHE_LINE + (r[1] - r[0]); r[0] = r[1]; r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1)); } } int main(){ uint64_t i, counter; double time; volatile uint8_t temp; arr = (uint8_t*) malloc(SIZE); rand_gen gen = init_rand(); init_array(gen); //Start timer set_clock(); for(i=0, counter=0; i<SIZE; counter++){ temp = arr[i]; i += next_addr(i); } //Stop timer time = elapsed_time(); temp = next_rand(gen)*10; i = temp; // Just to suppress the compiler warning for not using temp and gen if(counter < 10000000) printf("ERROR: Too few accesses. You have to access more elements to measure reasonable time difference.\n"); if((time/counter) < 1.0e-8) printf("ERROR: Time per access is too small. You have to further deoptimize the program to measure reasonable time difference.\n"); printf("Traversing %lx GB array took total time = %.4g seconds, number of accesses = %lu, %.4g seconds per access\n", SIZE/GB, time, counter, time/counter); return 0; } diff --git a/A3/numa_test b/A3/numa_test new file mode 100755 index 0000000..2641606 Binary files /dev/null and b/A3/numa_test differ diff --git a/A3/numa.c b/A3/numa_test.c similarity index 95% copy from A3/numa.c copy to A3/numa_test.c index 8ec6d22..c5fc278 100644 --- a/A3/numa.c +++ b/A3/numa_test.c @@ -1,135 +1,150 @@ #include <omp.h> #include <stdio.h> #include <stdlib.h> #include "utility.h" #include <stdint.h> #define GB (uint64_t)(1 << 30) #define SIZE (uint64_t)(8*GB) volatile uint8_t *arr; #define CACHE_LINE 64 // louis version accessor /*inline uint64_t next_addr(uint64_t i){ // Change this part if(i%3==0) { return 10 + 10*(arr[i]%10); } else { return 10 + 11*(arr[i]%11); } }*/ // louis version initializer /*inline void init_array(rand_gen gen){ for(uint64_t i=0; i<SIZE; ++i){ arr[i] = (uint64_t)(next_rand(gen) * 10); } }*/ // ancarola version -inline uint64_t next_addr(uint64_t i){ +/*inline uint64_t next_addr(uint64_t i){ return arr[i]; +}*/ + +inline uint64_t next_addr(uint64_t i){ + return 1; +} + +inline void init_array(rand_gen gen){ + for(uint64_t i=0; i<SIZE; i++){ + arr[i] = 1; + } } + + // 0, 1 alternated version /*inline void init_array(rand_gen gen){ // elude linear pattern recognizer for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){ uint64_t place = i / (2 * CACHE_LINE); if (place % 2 == 0) { // first place arr[i] = 2 * CACHE_LINE + 1; //printf("Allocando il dio boia: (%d, %d)\n", (int)i, (int)arr[i]); } else { // second place arr[i+1] = 2 * CACHE_LINE - 1; //printf("Allocando il dio boia: (%d, %d)\n", (int)(i+1), (int)arr[i+1]); } } }*/ // linearly shifted version /*inline void init_array(rand_gen gen){ // elude linear pattern recognizer uint64_t counter = 0; // shift 1 every two cache lines for(uint64_t i=0; i<SIZE; i += 2 * CACHE_LINE){ uint64_t place = i / (2 * CACHE_LINE); if (place % 2 == 0) { arr[i + counter] = 2 * CACHE_LINE; } else { if ((counter + 1) % CACHE_LINE == 0) { arr[i + counter] = CACHE_LINE + 1; counter = 0; } else { arr[i + counter] = 2 * CACHE_LINE + 1; ++counter; } } } }*/ // random version -inline void init_array(rand_gen gen){ +/*inline void init_array(rand_gen gen){ uint64_t r[2]; r[0] = 0; r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1)); for(uint64_t i=0; i<SIZE; i += 4 * CACHE_LINE){ arr[i + r[0]] = 4 * CACHE_LINE + (r[1] - r[0]); r[0] = r[1]; r[1] = (uint64_t)(next_rand(gen) * (CACHE_LINE - 1)); } -} +}*/ + + + int main(){ uint64_t i, counter; double time; volatile uint8_t temp; arr = (uint8_t*) malloc(SIZE); rand_gen gen = init_rand(); init_array(gen); //Start timer set_clock(); for(i=0, counter=0; i<SIZE; counter++){ temp = arr[i]; i += next_addr(i); } //Stop timer time = elapsed_time(); temp = next_rand(gen)*10; i = temp; // Just to suppress the compiler warning for not using temp and gen if(counter < 10000000) printf("ERROR: Too few accesses. You have to access more elements to measure reasonable time difference.\n"); if((time/counter) < 1.0e-8) printf("ERROR: Time per access is too small. You have to further deoptimize the program to measure reasonable time difference.\n"); printf("Traversing %lx GB array took total time = %.4g seconds, number of accesses = %lu, %.4g seconds per access\n", SIZE/GB, time, counter, time/counter); return 0; } diff --git a/A3/order_fence.c.save b/A3/order_fence.c.save new file mode 100644 index 0000000..e69de29 diff --git a/A3/order_fence.c.save.1 b/A3/order_fence.c.save.1 new file mode 100644 index 0000000..fafbc01 --- /dev/null +++ b/A3/order_fence.c.save.1 @@ -0,0 +1,80 @@ +#define _GNU_SOURCE +#include <pthread.h> +#include <omp.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> + +volatile int X, Y, r1, r2; +volatile bool t1block, t2block, t1fin, t2fin; + +void *thread1Func(void *param){ + for(;;){ + while(t1block); // Wait for blocker to be removed + t1block = true; // Set up blocker + + X = 1; + asm volatile("mfence" ::: "memory"); // Prevent any compiler reordering + r1 = Y; + + t1fin = true; // Signal iteration end + } + return NULL; // Never returns +} + +void *thread2Func(void *param){ + for(;;){ + while(t2block); // Wait for blocker to be removed + t2block = true; // Set up blocker + + Y = 1; + asm volatile("mfence" ::: "memory"); // Prevent any compiler reordering + r2 = X; + + t2fin = true; // Signal iteration end + } + return NULL; // Never returns +} + +int main(){ + //Init + t1block = t2block = true; // Act as blockers for threads + t1fin = t2fin = false; // Act as finish signals for threads + + // Spawn the threads + pthread_t thread0, thread1, thread2; + thread0 = pthread_self(); + pthread_create(&thread1, NULL, thread1Func, NULL); + pthread_create(&thread2, NULL, thread2Func, NULL); + + /* Set the main thread affinity to CPU 0 */ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + int s = pthread_setaffinity_np(thread0, sizeof(cpu_set_t), &cpuset); + if (s != 0){ + printf("Failed to set main thread affinity\n"); + return 0; + } + + // Repeat the experiment + int detected = 0, i = 1; + for (i = 1; i < 1000000; i++){ + // Reset all variables + X = 0; + Y = 0; + t1fin = t2fin = false; + t1block = t2block = false; + + // Wait for threads to complete their iteration + while(!(t1fin && t2fin)); + + // Check if there was a memory reordering + if (r1 == 0 && r2 == 0){ + detected++; + } + } + printf("%d memory re-orderings detected in %d iterations - %d percent\n", detected, i, 100*detected/i); + return 0; // Never returns +}