diff --git a/roofline/Dgemm/dgemm.c b/roofline/Dgemm/dgemm.c new file mode 100644 index 0000000..f353545 --- /dev/null +++ b/roofline/Dgemm/dgemm.c @@ -0,0 +1,135 @@ +#include /* standard I/O routines */ +#include +#include +#include /* standard I/O routines */ +#include + +#include +#include "utils.h" +//#include +#ifdef PAPI +#include "cscs_papi.h" +#endif + +#define NN 8000; +#define NTIMES 10 + +#if 0 +extern "C"{ +void dgemm_ (char *transa, + char *transb, + int *m, int *n, int *k, + double *alpha, double *A, int *lda, + double *B, int *ldb, + double *beta, double *C, int *ldc) ; +} +#endif + + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + + + + +int main (int argc, char *argv[]) +{ + + int N = NN; + int M = NN; + int K = NN; + int ii; + + /* Find out my identity in the default communicator */ + + int Ncpu = 0; + + if ( argc == 2 ) + { + N = atoi(argv[1]); + M = N; + K = N; + } + + double alpha = 1.; + double beta = -1.; + + int lda = M; + int ldb = K; + int ldc = M; + + int size_A = K*lda; + int size_B = N*ldb; + int size_C = N*ldc; + + double *A = (double*) malloc(sizeof(double)*size_A); + if (A == 0) printf("Could not allocate A.\n"); + + double *B = (double*) malloc(sizeof(double)*size_B); + if (B == 0) printf("Could not allocate B.\n"); + + double *C = (double*) malloc(sizeof(double)*size_C); + if (C == 0) printf("Could not allocate C.\n"); + + double *Cg = (double*) malloc(sizeof(double)*size_C); + if (Cg == 0) printf("Could not allocate Cg.\n"); + + fill(A, size_A, 31.); + eye (B, ldb, N ); + fill(C, size_C, 31.); + fill(Cg, size_C, 31.); + + + int t; + + int nthreads, tid, procs, maxthreads, inpar, dynamic, nested; + + char transa = 'n'; + char transb = 'n'; + + /* Get environment information */ +#ifdef _OPENMP + procs = omp_get_num_procs(); + nthreads = omp_get_num_threads(); + maxthreads = omp_get_max_threads(); + inpar = omp_in_parallel(); + dynamic = omp_get_dynamic(); + nested = omp_get_nested(); + /* Print environment information */ + printf("Number of processors = %d\n", procs); + printf("Number of threads = %d\n", nthreads); + printf("Max threads = %d\n", maxthreads); + printf("In parallel? = %d\n", inpar); + printf("Dynamic threads enabled? = %d\n", dynamic); + printf("Nested parallelism supported? = %d\n", nested); +#endif + + double otime = 0.; + otime -= mysecond(); + + for (ii = 0; ii < NTIMES; ++ii) + dgemm_(&transa, &transb, + &M, &N, &K, + &alpha, A, &lda, + B, &ldb, + &beta, Cg, &ldc); + + otime += mysecond(); + otime /= NTIMES; + + printf("Size = %d, Gflops Max: %f Min: %f Avg: %f\n", N, 2.*M*N*K/otime/1e9, 2.*M*N*K/otime/1e9, 2.*M*N*K/otime/1e9); + + free(A); + free(B); + free(C); + free(Cg); + + exit(0); +} diff --git a/roofline/Dgemm/make.inc b/roofline/Dgemm/make.inc new file mode 100644 index 0000000..e7f6fa9 --- /dev/null +++ b/roofline/Dgemm/make.inc @@ -0,0 +1,8 @@ + CC = icc + CPP = CC + FORT = ftn + + OPTS = -O3 + OPTS += -mkl + + #LIB = -lm -lgfortran diff --git a/roofline/Dgemm/makefile b/roofline/Dgemm/makefile new file mode 100644 index 0000000..532de0e --- /dev/null +++ b/roofline/Dgemm/makefile @@ -0,0 +1,28 @@ +include ./make.inc + + +all: dgemm + + +HEADERS1 = utils.h +SOURCES1 = dgemm.c + +OBJ1 = $(SOURCES1:.c=.o) + +dgemm: $(OBJ1) + $(CC) $(OBJ1) $(LDOPTS) $(OPTS) $(LIBDIR) $(LIB) $(INC) -o dgemm + +#.SUFFIXES: .c .cpp .cu .o + +%.o : %.c + echo $(OBJ) + $(CC) $(OPTS) $(INC) -c $< + + +clean: + echo $(OBJ1) dgemm + rm -f $(OBJ1) dgemm + + +.PHONY: clean + diff --git a/roofline/Dgemm/utils.h b/roofline/Dgemm/utils.h new file mode 100644 index 0000000..32c4cef --- /dev/null +++ b/roofline/Dgemm/utils.h @@ -0,0 +1,64 @@ +#include + +static void simple_dgemm(int n, double alpha, const double *A, const double *B, + double beta, double *C) +{ + int i; + int j; + int k; + for (i = 0; i < n; ++i) + { + for (j = 0; j < n; ++j) + { + double prod = 0; + for (k = 0; k < n; ++k) + { + prod += A[k * n + i] * B[j * n + k]; + } + //std::cout << prod << std::endl; + C[j * n + i] = alpha*prod + beta*C[j*n + i]; + } + } +} + + + +double verifyResult(const double *mat, const double *mat_ref, int M, int N) +{ + double norm = 0.0; + int i; + int j; + + for (i = 0; i < M; i++) + { + for (j = 0; j < N; j++) + { + //mat[i+j*M] << " " << mat_ref[i + j*M] << std::endl; + if (abs((double)mat[i + j*M ] - (double)mat_ref[i + j*M ]) > norm) + { + norm = abs((double)mat[i + j*M] - (double)mat_ref[i + j*M]); + if (norm != 0.) + printf("%d %d = %f\n", i, j, norm); + } + } +// std::cout << "----" << std::endl; + } + return norm; +} + + +static void fill(double *A, int size, double v) +{ + int ii; + for (ii = 0; ii < size; ++ii) + A[ii] = (double) v; +} + +static void eye(double *A, int lda, int n) +{ + fill(A, lda*n, 0.); + int ii; + + for (ii = 0; ii < lda; ++ii) + A[ii*lda + ii] = 1.; +} diff --git a/roofline/Stream/Makefile b/roofline/Stream/Makefile new file mode 100644 index 0000000..9620ff9 --- /dev/null +++ b/roofline/Stream/Makefile @@ -0,0 +1,14 @@ +CC = icc + +CFLAGS = -O3 -march=native +CFLAGS += -Wall +CFLAGS += -qopenmp + + +all: stream + +stream: stream.c + $(CC) $(CFLAGS) -qopt-streaming-stores=always stream.c -o stream + +clean: + rm -f stream *.o diff --git a/roofline/Stream/stream.c b/roofline/Stream/stream.c new file mode 100644 index 0000000..e0bc7df --- /dev/null +++ b/roofline/Stream/stream.c @@ -0,0 +1,614 @@ +/*-----------------------------------------------------------------------*/ +/* Program: STREAM */ +/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2013: John D. McCalpin */ +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. */ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear, and reasonable labelling is */ +/* acceptable. */ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. */ +/*-----------------------------------------------------------------------*/ +# include +# include +# include +# include +# include +# include + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. + * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means the each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems. Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 80000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". + */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 100 +#endif +#endif +#ifndef NTIMES +# define NTIMES 100 +#endif + +/* Users are allowed to modify the "OFFSET" variable, which *may* change the + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). + * Use of non-zero values for OFFSET can be especially helpful if the + * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. + * OFFSET can also be set on the compile line without changing the source + * code using, for example, "-DOFFSET=56". + */ +#ifndef OFFSET +# define OFFSET 0 +#endif + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. + * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me! + * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... + * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. + * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! + * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +static inline unsigned long long cycles() +{ + unsigned long long u; + //asm volatile ("rdtsc;shlq $32,%%rdx;orq %%rdx,%%rax":"=a"(u)::"%rdx"); + asm volatile ("rdtscp;shlq $32,%%rdx;orq %%rdx,%%rax;movq %%rax,%0":"=q"(u)::"%rax", "%rdx", "rcx"); + return u; +} + + + +static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], + b[STREAM_ARRAY_SIZE+OFFSET], + c[STREAM_ARRAY_SIZE+OFFSET]; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE +}; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif + int +main() +{ + int quantum, checktick(); + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); +#endif + + /* Get initial value for system clock. */ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + unsigned long long c0 = 0; + long long c1 = 0; + + + + c0 = cycles(); + sleep(1); + c1 = cycles(); + printf("1 second sleep, number of cycles = %ld\n", c1 - c0); + + scalar = 3.0; + c0 = cycles(); + //printf("c0 = %ld\n", c0); + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +#ifndef abs +#define abs(a) ((a) >= 0 ? (a) : -(a)) +#endif +void checkSTREAMresults () +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j