Page MenuHomec4science

algorithm_Ancarola.c
No OneTemporary

File Metadata

Created
Sun, Sep 29, 06:05

algorithm_Ancarola.c

/*
============================================================================
Filename : algorithm.c
Author : Your names go here
SCIPER : Your SCIPER numbers
============================================================================
*/
#include <math.h>
#include <stddef.h>
/*
 * Row-major accessors for the two grids. They expand to expressions that use
 * the identifiers `input`, `output` and `length` of the enclosing scope.
 * The offset is computed in size_t so that (I)*length cannot overflow `int`
 * on large grids.
 */
#define INPUT(I,J) input[(size_t)(I) * (size_t)length + (size_t)(J)]
#define OUTPUT(I,J) output[(size_t)(I) * (size_t)length + (size_t)(J)]
#define CACHE_SIZE 8
/*
 * Split the linear index K into row I and column J (uses `length` of the
 * enclosing scope). Wrapped in do/while(0) so it behaves as one statement.
 */
#define COORD(I,J,K) do { (I) = (K) / length; (J) = (K) - (I) * length; } while (0)

/*
 * Return non-zero when (i, j) lies in the central 2x2 block of the grid.
 * simulate() never writes these cells, so they keep the values the caller
 * stored in them.
 */
static int in_fixed_center(int i, int j, int length)
{
    const int mid = length / 2;

    return (i == mid - 1 || i == mid) && (j == mid - 1 || j == mid);
}

/*
 * Apply `iterations` sweeps of a 3x3 averaging stencil to a square grid.
 *
 * Each sweep replaces every interior cell, except the central 2x2 block, by
 * the mean of the nine cells of its 3x3 neighbourhood in the previous state.
 * The two buffers are used alternately as source and destination.
 *
 * input      - initial state, length*length doubles in row-major order.
 *              It doubles as scratch space and is overwritten as soon as
 *              iterations >= 2.
 * output     - receives the state after `iterations` sweeps (for 0 sweeps,
 *              the interior of `input` is copied).
 * threads    - number of OpenMP threads requested for the parallel loops;
 *              values below 1 are treated as 1.
 * length     - side length of the grid. Grids with length < 3 have no
 *              interior, so nothing is computed.
 * iterations - number of sweeps; values <= 0 perform no sweep.
 *
 * Border cells (first/last row and column) and the central 2x2 block are
 * never written in either buffer. Since sweeps alternately read from both
 * buffers, the caller should store the same values in those cells of BOTH
 * buffers before calling.
 */
void simulate(double *input, double *output, int threads, int length, int iterations)
{
    /* Remember where the caller expects the result; the locals get swapped. */
    double *const result = output;

    /*
     * The thread count is passed through the num_threads clause instead of
     * omp_set_num_threads(): no <omp.h> declaration is needed, the file
     * still builds without OpenMP, and other parallel regions of the
     * program keep their own thread count.
     */
    const int team = threads > 0 ? threads : 1;

    for (int step = 0; step < iterations; step++) {
        #pragma omp parallel for collapse(2) num_threads(team)
        for (int i = 1; i < length - 1; i++) {
            for (int j = 1; j < length - 1; j++) {
                if (in_fixed_center(i, j, length)) {
                    continue;
                }

                /*
                 * Sum the stencil one grid row at a time: the three cells
                 * of a row are adjacent in memory. Being loop-local, the
                 * partial sums are private to each thread.
                 */
                const double upper  = INPUT(i - 1, j - 1) + INPUT(i - 1, j) + INPUT(i - 1, j + 1);
                const double middle = INPUT(i, j - 1) + INPUT(i, j) + INPUT(i, j + 1);
                const double lower  = INPUT(i + 1, j - 1) + INPUT(i + 1, j) + INPUT(i + 1, j + 1);

                OUTPUT(i, j) = (upper + middle + lower) / 9;
            }
        }

        /* The grid just written becomes the source of the next sweep. */
        double *swap = input;
        input = output;
        output = swap;
    }

    /*
     * After the final swap `input` points at the newest state. For an even
     * number of sweeps (including zero) that is not the caller's output
     * buffer, so copy the cells this function owns into it. In that case
     * the local `output` is equal to `result`.
     */
    if (input != result) {
        #pragma omp parallel for collapse(2) num_threads(team)
        for (int i = 1; i < length - 1; i++) {
            for (int j = 1; j < length - 1; j++) {
                if (in_fixed_center(i, j, length)) {
                    continue;
                }
                OUTPUT(i, j) = INPUT(i, j);
            }
        }
    }
}

Event Timeline