diff --git a/roofline/Dgemm/dgemm.c b/roofline/Dgemm/dgemm.c
new file mode 100644
index 0000000..f353545
--- /dev/null
+++ b/roofline/Dgemm/dgemm.c
@@ -0,0 +1,135 @@
+#include <stdio.h>       /* standard I/O routines                 */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>       /* standard I/O routines                 */
+#include <omp.h>
+
+#include <sys/time.h>
+#include "utils.h"
+//#include <cblas.h>
+#ifdef PAPI
+#include "cscs_papi.h"
+#endif
+
+#define NN 8000;
+#define NTIMES 10
+
+#if 0
+extern "C"{
+void dgemm_ (char *transa,
+		char *transb,
+		int *m, int *n, int *k,
+		double *alpha, double *A, int *lda,
+		double *B, int *ldb,
+		double *beta, double *C, int *ldc) ;
+}	
+#endif
+
+
+double mysecond()
+{
+	struct timeval tp;
+	struct timezone tzp;
+	int i;
+
+	i = gettimeofday(&tp,&tzp);
+	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+
+
+
+int main (int argc, char *argv[])
+{
+
+	int N = NN;
+	int M = NN;
+	int K = NN;
+	int ii;
+
+	/* Find out my identity in the default communicator */
+
+	int Ncpu = 0;
+
+	if ( argc == 2 ) 
+	{
+		N    = atoi(argv[1]);
+		M = N;
+		K = N;
+	}
+
+	double  alpha =  1.;
+	double  beta  = -1.;
+
+	int lda    = M;
+	int ldb    = K;
+	int ldc    = M;
+
+	int size_A = K*lda;
+	int size_B = N*ldb;
+	int size_C = N*ldc;
+
+	double *A  = (double*) malloc(sizeof(double)*size_A);
+	if (A == 0) printf("Could not allocate A.\n");
+
+	double *B  = (double*) malloc(sizeof(double)*size_B);
+	if (B == 0) printf("Could not allocate B.\n");
+
+	double *C  = (double*) malloc(sizeof(double)*size_C);
+	if (C == 0) printf("Could not allocate C.\n");
+
+	double *Cg = (double*) malloc(sizeof(double)*size_C);
+	if (Cg == 0) printf("Could not allocate Cg.\n");
+
+	fill(A,  size_A,  31.);
+	eye (B,     ldb,   N );
+	fill(C,  size_C,  31.);
+	fill(Cg, size_C,  31.);
+
+
+	int t;
+
+	int nthreads, tid, procs, maxthreads, inpar, dynamic, nested;
+
+	char transa = 'n';
+	char transb = 'n';
+
+	/* Get environment information */
+#ifdef _OPENMP
+	procs      = omp_get_num_procs();
+	nthreads   = omp_get_num_threads();
+	maxthreads = omp_get_max_threads();
+	inpar      = omp_in_parallel();
+	dynamic    = omp_get_dynamic();
+	nested     = omp_get_nested();
+	/* Print environment information */
+	printf("Number of processors          = %d\n", procs);
+	printf("Number of threads             = %d\n", nthreads);
+	printf("Max threads                   = %d\n", maxthreads);
+	printf("In parallel?                  = %d\n", inpar);
+	printf("Dynamic threads enabled?      = %d\n", dynamic);
+	printf("Nested parallelism supported? = %d\n", nested);
+#endif
+
+	double otime = 0.;
+	otime -= mysecond();
+
+	for (ii = 0; ii < NTIMES; ++ii)
+		dgemm_(&transa, &transb,
+				&M, &N, &K,
+				&alpha, A, &lda,
+				B, &ldb,
+				&beta, Cg, &ldc);
+
+	otime += mysecond();
+	otime /= NTIMES;
+
+	printf("Size = %d, Gflops Max: %f  Min: %f  Avg:  %f\n", N, 2.*M*N*K/otime/1e9, 2.*M*N*K/otime/1e9, 2.*M*N*K/otime/1e9);
+
+	free(A);
+	free(B);
+	free(C);
+	free(Cg);
+
+	exit(0);
+}
diff --git a/roofline/Dgemm/make.inc b/roofline/Dgemm/make.inc
new file mode 100644
index 0000000..e7f6fa9
--- /dev/null
+++ b/roofline/Dgemm/make.inc
@@ -0,0 +1,8 @@
+	CC        = icc
+	CPP       = CC
+	FORT      = ftn
+
+	OPTS      = -O3 
+	OPTS     += -mkl
+
+	#LIB       = -lm -lgfortran 
diff --git a/roofline/Dgemm/makefile b/roofline/Dgemm/makefile
new file mode 100644
index 0000000..532de0e
--- /dev/null
+++ b/roofline/Dgemm/makefile
@@ -0,0 +1,28 @@
+include ./make.inc
+
+
+all: dgemm
+
+
+HEADERS1 = utils.h 
+SOURCES1 = dgemm.c  
+
+OBJ1 = $(SOURCES1:.c=.o)
+
+dgemm: $(OBJ1)
+	$(CC) $(OBJ1) $(LDOPTS) $(OPTS) $(LIBDIR) $(LIB) $(INC) -o dgemm 
+
+#.SUFFIXES: .c .cpp .cu .o
+
+%.o : %.c
+	echo $(OBJ)
+	$(CC) $(OPTS) $(INC) -c $<
+
+
+clean:
+	echo $(OBJ1) dgemm 
+	rm -f $(OBJ1) dgemm  
+
+
+.PHONY: clean
+
diff --git a/roofline/Dgemm/utils.h b/roofline/Dgemm/utils.h
new file mode 100644
index 0000000..32c4cef
--- /dev/null
+++ b/roofline/Dgemm/utils.h
@@ -0,0 +1,64 @@
+#include <math.h>
+
+static void simple_dgemm(int n, double alpha, const double *A, const double *B,
+                         double beta, double *C)
+{
+    int i;
+    int j;
+    int k;
+    for (i = 0; i < n; ++i)
+      {
+        for (j = 0; j < n; ++j)
+          {
+            double prod = 0;
+            for (k = 0; k < n; ++k)
+              {
+                prod += A[k * n + i] * B[j * n + k];
+              }
+            //std::cout << prod << std::endl;
+            C[j * n + i] = alpha*prod + beta*C[j*n + i];
+          }
+      }
+}
+
+
+
+double verifyResult(const double *mat, const double *mat_ref, int M, int N)
+{
+  	double norm = 0.0;
+	int i;
+	int j;
+
+    for (i = 0; i < M; i++)
+    	{
+        for (j = 0; j < N; j++)
+          {
+            //mat[i+j*M] << " " << mat_ref[i + j*M] << std::endl;
+            if (abs((double)mat[i + j*M ] - (double)mat_ref[i + j*M ]) > norm)
+              {
+                norm = abs((double)mat[i + j*M] - (double)mat_ref[i + j*M]);
+		if (norm != 0.)
+			printf("%d %d = %f\n", i, j, norm); 
+              }
+          }
+//        std::cout << "----" << std::endl;
+      }
+    return norm;
+}
+
+
+static void fill(double *A, int size, double v)
+{
+	int ii;
+  	for (ii = 0; ii < size; ++ii)
+    	A[ii] = (double) v;
+}
+
+static void eye(double *A, int lda, int n)
+{
+   	fill(A, lda*n, 0.);
+	int ii;
+
+   	for (ii = 0; ii < lda; ++ii)
+    	A[ii*lda + ii] = 1.;
+}
diff --git a/roofline/Stream/Makefile b/roofline/Stream/Makefile
new file mode 100644
index 0000000..9620ff9
--- /dev/null
+++ b/roofline/Stream/Makefile
@@ -0,0 +1,14 @@
+CC = icc 
+
+CFLAGS = -O3 -march=native 
+CFLAGS += -Wall
+CFLAGS += -qopenmp
+
+
+all: stream 
+
+stream: stream.c
+	$(CC) $(CFLAGS) -qopt-streaming-stores=always stream.c -o stream
+
+clean:
+	rm -f stream *.o
diff --git a/roofline/Stream/stream.c b/roofline/Stream/stream.c
new file mode 100644
index 0000000..e0bc7df
--- /dev/null
+++ b/roofline/Stream/stream.c
@@ -0,0 +1,614 @@
+/*-----------------------------------------------------------------------*/
+/* Program: STREAM                                                       */
+/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2013: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*           "tuned STREAM benchmark results"                            */
+/*           "based on a variant of the STREAM benchmark code"           */
+/*         Other comparable, clear, and reasonable labelling is          */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.  
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.  
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.  
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tic is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means the each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *          elements to 10 million elements in response to the increasing
+ *          size of L3 caches.  The new default size is large enough for caches
+ *          up to 20 MB. 
+ *      Version 5.10 changes the loop index variables from "register int"
+ *          to "ssize_t", which allows array indices >2^32 (4 billion)
+ *          on properly configured 64-bit systems.  Additional compiler options
+ *          (such as "-mcmodel=medium") may be required for large memory runs.
+ *
+ *      Array size can be set at compile time without modifying the source
+ *          code for the (many) compilers that support preprocessor definitions
+ *          on the compile line.  E.g.,
+ *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
+ *          will override the default size of 10M with a new size of 100M elements
+ *          per array.
+ */
+#ifndef STREAM_ARRAY_SIZE
+#   define STREAM_ARRAY_SIZE	80000000
+#endif
+
+/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
+ *         for any iteration after the first, therefore the minimum value
+ *         for NTIMES is 2.
+ *      There are no rules on maximum allowable values for NTIMES, but
+ *         values larger than the default are unlikely to noticeably
+ *         increase the reported performance.
+ *      NTIMES can also be set on the compile line without changing the source
+ *         code using, for example, "-DNTIMES=7".
+ */
+#ifdef NTIMES
+#if NTIMES<=1
+#   define NTIMES	100
+#endif
+#endif
+#ifndef NTIMES
+#   define NTIMES	100
+#endif
+
+/*  Users are allowed to modify the "OFFSET" variable, which *may* change the
+ *         relative alignment of the arrays (though compilers may change the 
+ *         effective offset by making the arrays non-contiguous on some systems). 
+ *      Use of non-zero values for OFFSET can be especially helpful if the
+ *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
+ *      OFFSET can also be set on the compile line without changing the source
+ *         code using, for example, "-DOFFSET=56".
+ */
+#ifndef OFFSET
+#   define OFFSET	0
+#endif
+
+/*
+ *	3) Compile the code with optimization.  Many compilers generate
+ *       unreasonably bad code before the optimizer tightens things up.  
+ *     If the results are unreasonably good, on the other hand, the
+ *       optimizer might be too smart for me!
+ *
+ *     For a simple single-core version, try compiling with:
+ *            cc -O stream.c -o stream
+ *     This is known to work on many, many systems....
+ *
+ *     To use multiple cores, you need to tell the compiler to obey the OpenMP
+ *       directives in the code.  This varies by compiler, but a common example is
+ *            gcc -O -fopenmp stream.c -o stream_omp
+ *       The environment variable OMP_NUM_THREADS allows runtime control of the 
+ *         number of threads/cores used when the resulting "stream_omp" program
+ *         is executed.
+ *
+ *     To run with single-precision variables and arithmetic, simply add
+ *         -DSTREAM_TYPE=float
+ *     to the compile line.
+ *     Note that this changes the minimum array sizes required --- see (1) above.
+ *
+ *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
+ *       code to call separate functions to execute each kernel.  Trivial versions
+ *       of these functions are provided, but they are *not* tuned -- they just 
+ *       provide predefined interfaces to be replaced with tuned code.
+ *
+ *
+ *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include info that will help me understand:
+ *		a) the computer hardware configuration (e.g., processor model, memory type)
+ *		b) the compiler name/version and compilation flags
+ *      c) any run-time information (such as OMP_NUM_THREADS)
+ *		d) all of the output from the test case.
+ *
+ * Thanks!
+ *
+ *-----------------------------------------------------------------------*/
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+#ifndef STREAM_TYPE
+#define STREAM_TYPE double
+#endif
+
+static inline unsigned long long cycles()
+{
+	unsigned long long u;
+	//asm volatile ("rdtsc;shlq $32,%%rdx;orq %%rdx,%%rax":"=a"(u)::"%rdx");
+	asm volatile ("rdtscp;shlq $32,%%rdx;orq %%rdx,%%rax;movq %%rax,%0":"=q"(u)::"%rax", "%rdx", "rcx");
+	return u;
+}
+
+
+
+static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET],
+			b[STREAM_ARRAY_SIZE+OFFSET],
+			c[STREAM_ARRAY_SIZE+OFFSET];
+
+static double	avgtime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+	"Add:       ", "Triad:     "};
+
+static double	bytes[4] = {
+	2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+	2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+	3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+	3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
+};
+
+extern double mysecond();
+extern void checkSTREAMresults();
+#ifdef TUNED
+extern void tuned_STREAM_Copy();
+extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
+#endif
+#ifdef _OPENMP
+extern int omp_get_num_threads();
+#endif
+	int
+main()
+{
+	int			quantum, checktick();
+	int			BytesPerWord;
+	int			k;
+	ssize_t		j;
+	STREAM_TYPE		scalar;
+	double		t, times[4][NTIMES];
+
+	/* --- SETUP --- determine precision and check timing --- */
+
+	printf(HLINE);
+	printf("STREAM version $Revision: 5.10 $\n");
+	printf(HLINE);
+	BytesPerWord = sizeof(STREAM_TYPE);
+	printf("This system uses %d bytes per array element.\n",
+			BytesPerWord);
+
+	printf(HLINE);
+#ifdef N
+	printf("*****  WARNING: ******\n");
+	printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
+	printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
+	printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
+	printf("*****  WARNING: ******\n");
+#endif
+
+	printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
+	printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
+			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
+			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
+	printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
+			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
+			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
+	printf("Each kernel will be executed %d times.\n", NTIMES);
+	printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
+	printf(" will be used to compute the reported bandwidth.\n");
+
+#ifdef _OPENMP
+	printf(HLINE);
+#pragma omp parallel 
+	{
+#pragma omp master
+		{
+			k = omp_get_num_threads();
+			printf ("Number of Threads requested = %i\n",k);
+		}
+	}
+#endif
+
+#ifdef _OPENMP
+	k = 0;
+#pragma omp parallel
+#pragma omp atomic 
+	k++;
+	printf ("Number of Threads counted = %i\n",k);
+#endif
+
+	/* Get initial value for system clock. */
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+		a[j] = 1.0;
+		b[j] = 2.0;
+		c[j] = 0.0;
+	}
+
+	printf(HLINE);
+
+	if  ( (quantum = checktick()) >= 1) 
+		printf("Your clock granularity/precision appears to be "
+				"%d microseconds.\n", quantum);
+	else {
+		printf("Your clock granularity appears to be "
+				"less than one microsecond.\n");
+		quantum = 1;
+	}
+
+	t = mysecond();
+#pragma omp parallel for
+	for (j = 0; j < STREAM_ARRAY_SIZE; j++)
+		a[j] = 2.0E0 * a[j];
+	t = 1.0E6 * (mysecond() - t);
+
+	printf("Each test below will take on the order"
+			" of %d microseconds.\n", (int) t  );
+	printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+	printf("Increase the size of the arrays if this shows that\n");
+	printf("you are not getting at least 20 clock ticks per test.\n");
+
+	printf(HLINE);
+
+	printf("WARNING -- The above is only a rough guideline.\n");
+	printf("For best results, please be sure you know the\n");
+	printf("precision of your system timer.\n");
+	printf(HLINE);
+
+	/*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+	unsigned long long c0 = 0;
+	long long c1 = 0;
+
+
+
+	c0 = cycles();
+	sleep(1);
+	c1 = cycles();
+	printf("1 second sleep, number of cycles = %ld\n", c1 - c0);
+
+	scalar = 3.0;
+	c0 = cycles();
+	//printf("c0 = %ld\n", c0);
+	for (k=0; k<NTIMES; k++)
+	{
+		times[0][k] = mysecond();
+#ifdef TUNED
+		tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+		for (j=0; j<STREAM_ARRAY_SIZE; j++)
+			c[j] = a[j];
+#endif
+		times[0][k] = mysecond() - times[0][k];
+
+		times[1][k] = mysecond();
+#ifdef TUNED
+		tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+		for (j=0; j<STREAM_ARRAY_SIZE; j++)
+			b[j] = scalar*c[j];
+#endif
+		times[1][k] = mysecond() - times[1][k];
+
+		times[2][k] = mysecond();
+#ifdef TUNED
+		tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+		for (j=0; j<STREAM_ARRAY_SIZE; j++)
+			c[j] = a[j]+b[j];
+#endif
+		times[2][k] = mysecond() - times[2][k];
+
+		times[3][k] = mysecond();
+#ifdef TUNED
+		__asm__ volatile("# Triad begins");
+		tuned_STREAM_Triad(scalar);
+		__asm__ volatile("# Triad ends");
+		//printf("%ld\n", c0);
+#else
+#pragma omp parallel for
+		for (j=0; j<STREAM_ARRAY_SIZE; j++)
+			a[j] = b[j]+scalar*c[j];
+#endif
+		times[3][k] = mysecond() - times[3][k];
+	}
+	c1 = cycles();
+	//printf("c1 = %ld\n", c1);
+
+	/*	--- SUMMARY --- */
+
+	for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+		for (j=0; j<4; j++)
+		{
+			avgtime[j] = avgtime[j] + times[j][k];
+			mintime[j] = MIN(mintime[j], times[j][k]);
+			maxtime[j] = MAX(maxtime[j], times[j][k]);
+		}
+	}
+
+	printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
+	for (j=0; j<4; j++) {
+		avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
+				1.0E-06 * bytes[j]/mintime[j],
+				avgtime[j],
+				mintime[j],
+				maxtime[j]);
+	}
+	printf(HLINE);
+
+	//printf("cycles : %lf\n", (double) (c1 -  c0)/NTIMES/STREAM_ARRAY_SIZE);
+	/* --- Check Results --- */
+	checkSTREAMresults();
+	printf(HLINE);
+
+	return 0;
+}
+
+# define	M	20
+
+int
+checktick()
+{
+	int		i, minDelta, Delta;
+	double	t1, t2, timesfound[M];
+
+	/*  Collect a sequence of M unique time values from the system. */
+
+	for (i = 0; i < M; i++) {
+		t1 = mysecond();
+		while( ((t2=mysecond()) - t1) < 1.0E-6 )
+			;
+		timesfound[i] = t1 = t2;
+	}
+
+	/*
+	 * Determine the minimum difference between these M values.
+	 * This result will be our estimate (in microseconds) for the
+	 * clock granularity.
+	 */
+
+	minDelta = 1000000;
+	for (i = 1; i < M; i++) {
+		Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
+		minDelta = MIN(minDelta, MAX(Delta,0));
+	}
+
+	return(minDelta);
+}
+
+
+
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.  */
+
+#include <sys/time.h>
+
+double mysecond()
+{
+	struct timeval tp;
+	struct timezone tzp;
+	int i;
+
+	i = gettimeofday(&tp,&tzp);
+	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+void checkSTREAMresults ()
+{
+	STREAM_TYPE aj,bj,cj,scalar;
+	STREAM_TYPE aSumErr,bSumErr,cSumErr;
+	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
+	double epsilon;
+	ssize_t	j;
+	int	k,ierr,err;
+
+	/* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+	/* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+	/* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++)
+	{
+		cj = aj;
+		bj = scalar*cj;
+		cj = aj+bj;
+		aj = bj+scalar*cj;
+	}
+
+	/* accumulate deltas between observed and expected results */
+	aSumErr = 0.0;
+	bSumErr = 0.0;
+	cSumErr = 0.0;
+	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+		aSumErr += abs(a[j] - aj);
+		bSumErr += abs(b[j] - bj);
+		cSumErr += abs(c[j] - cj);
+		// if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj);	// MCCALPIN
+	}
+	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+
+	if (sizeof(STREAM_TYPE) == 4) {
+		epsilon = 1.e-6;
+	}
+	else if (sizeof(STREAM_TYPE) == 8) {
+		epsilon = 1.e-13;
+	}
+	else {
+		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
+		epsilon = 1.e-6;
+	}
+
+	err = 0;
+	if (abs(aAvgErr/aj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(a[j]/aj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+							j,aj,a[j],abs((aj-a[j])/aAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array a[], %d errors were found.\n",ierr);
+	}
+	if (abs(bAvgErr/bj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(b[j]/bj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+							j,bj,b[j],abs((bj-b[j])/bAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array b[], %d errors were found.\n",ierr);
+	}
+	if (abs(cAvgErr/cj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(c[j]/cj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+							j,cj,c[j],abs((cj-c[j])/cAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array c[], %d errors were found.\n",ierr);
+	}
+	if (err == 0) {
+		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
+	}
+#ifdef VERBOSE
+	printf ("Results Validation Verbose Results: \n");
+	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
+	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
+	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
+#endif
+}
+
+#ifdef TUNED
+/* stubs for "tuned" versions of the kernels */
+void tuned_STREAM_Copy()
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+		c[j] = a[j];
+}
+
+void tuned_STREAM_Scale(STREAM_TYPE scalar)
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+		b[j] = scalar*c[j];
+}
+
+void tuned_STREAM_Add()
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+		c[j] = a[j]+b[j];
+}
+
+	inline
+void tuned_STREAM_Triad(STREAM_TYPE scalar)
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+		a[j] = b[j]+scalar*c[j];
+}
+/* end of stubs for the "tuned" versions of the kernels */
+#endif