Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F120527572
dgemm.c
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jul 5, 00:44
Size
6 KB
Mime Type
text/x-c
Expires
Mon, Jul 7, 00:44 (2 d)
Engine
blob
Format
Raw Data
Handle
27202558
Attached To
R31 arm64-hpc
dgemm.c
View Options
/*
* This matrix multiply driver was originally written by Jason Riedy.
* Most of the code (and comments) are his, but there are several
* things I've hacked up. It seemed to work for him, so any errors
* are probably mine.
*
* Ideally, you won't need to change this file. You may want to change
* a few settings to speed debugging runs, but remember to change back
* to the original settings during final testing.
*
* The output format: "Size: %u\tmflop/s: %g\n"
* */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <sys/types.h>
#include <sys/resource.h>
#include <unistd.h>
#include <sys/time.h>
#include "mm_malloc.h"
//#include <mkl.h>
/*
* We try to run enough iterations to get reasonable timings. The matrices
* are multiplied at least MIN_RUNS times. If that doesn't take MIN_CPU_SECS
* seconds, then we double the number of iterations and try again.
*
* You may want to modify these to speed debugging...
* */
#define MIN_RUNS 1
#define MIN_CPU_SECS 0.00000001
#define _alpha 1.e0
#define _beta 1.e0
#define NN 512
void dgemm( const int, const int, const int, const double, const double *, const int, const double *, const int, const double, double* , const int );
void dgemm_ref( const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double* C, const int ldc)
{
int i, j, k;
long int ops = 0;
long int mem = 0;
for (i = 0; i < M; ++i)
for (j = 0; j < N; ++j)
{
double cij = beta*C[j*ldc + i];
mem += 8;
for (k = 0; k < K; ++k)
{
cij += alpha*A[i + k*lda]*B[k + j*ldb];
ops += 2;
mem += 2*8;
}
C[j*ldc + i] = cij;
mem += 8;
}
printf("ref ops = %ld\n", ops);
printf("ref mem = %ld\n", mem);
}
double mysecond()
{
struct timeval tp;
struct timezone tzp;
int i;
i = gettimeofday(&tp,&tzp);
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
static void init(double *A, int M, int N, double v)
{
int ii, jj;
for (ii = 0; ii < M; ++ii)
for (jj= 0; jj < N; ++jj)
A[jj*M + ii] = v + jj + 0*(jj*N + ii + 0);
}
static void init_t(double *A, int M, int N, double v)
{
int ii, jj;
for (ii = 0; ii < M; ++ii)
for (jj= 0; jj < N; ++jj)
{
if (ii == jj)
A[jj*M + ii] = v;//(jj*M + ii);
// printf("%d %d %f\n", ii, jj, A[jj*M + ii]);
}
}
void
matrix_init (double *A, const int M, const int N, double val)
{
int i;
for (i = 0; i < M*N; ++i)
{
A[i] = drand48 ();
//A[i] = M*val/N;
}
}
void
matrix_clear (double *C, const int M, const int N)
{
memset (C, 0, M*N*sizeof (double));
}
double
time_dgemm (const int M, const int N, const unsigned K,
const double alpha, const double *A, const int lda,
const double *B, const int ldb,
const double beta, double *C, const unsigned ldc)
{
double mflops, mflop_s;
double secs = -1;
int num_iterations = MIN_RUNS;
int i;
double* Ca = (double*) _mm_malloc(N*ldc*sizeof(double), 32);
while (secs < MIN_CPU_SECS)
{
double cpu_time = 0;
double last_clock = mysecond();
for (i = 0; i < num_iterations; ++i)
{
memcpy(Ca, C, N*ldc*sizeof(double));
cpu_time -= mysecond();
dgemm(M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
cpu_time += mysecond();
}
mflops = 2.0 * num_iterations*M*N*K/1.0e6;
secs = cpu_time;
mflop_s = mflops/secs;
num_iterations *= 2;
}
memcpy(C, Ca, N*ldc*sizeof(double));
_mm_free(Ca);
return mflop_s;
}
double time_dgemm_blas(const int M, const unsigned N, const int K,
const double alpha, const double *A, const int lda,
const double *B, const int ldb,
const double beta, double *C, const int ldc)
{
double mflops, mflop_s;
double secs = -1;
int num_iterations = MIN_RUNS;
int i;
char transa = 'n';
char transb = 'n';
double* Ca = (double*) _mm_malloc(N*ldc*sizeof(double), 32);
while (secs < MIN_CPU_SECS)
{
double cpu_time = 0;
for (i = 0; i < num_iterations; ++i)
{
memcpy(Ca, C, N*ldc*sizeof(double));
cpu_time -= mysecond();
dgemm_(&transa, &transb, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, Ca, &ldc);
//dgemm_ref (M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
cpu_time += mysecond();
}
mflops = 2.0*num_iterations*M*N*K/1.0e6;
secs = cpu_time;
mflop_s = mflops/secs;
num_iterations *= 2;
}
memcpy(C, Ca, N*ldc*sizeof(double));
_mm_free(Ca);
return mflop_s;
}
int
main (int argc, char *argv[])
{
int sz_i;
double mflop_s, mflop_b;
int M, N, K;
if ( argc == 4 )
{
M = atoi(argv[1]);
N = atoi(argv[2]);
K = atoi(argv[3]);
}
else if (argc == 2)
{
M = atoi(argv[1]);
N = M;
K = M;
}
else
{
M = NN;
N = NN;
K = NN;
}
int lda = M;
int ldb = K;
int ldc = M;
double* A = (double*) _mm_malloc(M*K*sizeof(double), 4096);
double* B = (double*) _mm_malloc(K*N*sizeof(double), 4096);
double* C = (double*) _mm_malloc(M*N*sizeof(double), 4096);
double* Cb = (double*) _mm_malloc(M*N*sizeof(double), 4096);
matrix_init(A, M, K, 1.);
//init_t(A, M, K, 1.);
matrix_init(B, K, N, 1.);
//init_t(B, K, N, 1.);
//matrix_clear(C);
//init_t(C, N, M, 1.);
matrix_init(C, M, N, 1.);
memcpy(Cb, C, M*N*sizeof(double));
//matrix_init(Cb, M, N, 0.);
//const int M = test_sizes[sz_i];
printf("Size: %u %u %u\t", M, N, K) ; fflush(stdout);
mflop_s = time_dgemm (M, N, K, _alpha, A, lda, B, ldb, _beta, C , ldc);
printf ("my Gflops/s: %g ", mflop_s/1000.); fflush(stdout);
#if 1
mflop_b = time_dgemm_blas(M, N, K, _alpha, A, lda, B, ldb, _beta, Cb, ldc);
printf ("blas Gflops/s: %g\n", mflop_b/1000);
#if 1
int ii, jj;
for (jj = 0; jj < N; ++jj)
{
for (ii = 0; ii < M; ++ii)
{
if (abs(C[jj*ldc + ii] - Cb[jj*ldc + ii]) != 0)
printf("i = %d, j = %d, C = %f, should be %f\n", ii, jj, C[jj*ldc + ii], Cb[jj*ldc + ii]);
}
}
#endif
#endif
printf("\n");
_mm_free(A);
_mm_free(B);
_mm_free(C);
_mm_free(Cb);
return 0;
}
Event Timeline
Log In to Comment