dgemm.c
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Jul 5, 00:44

dgemm.c
View Options

	/*
	* This matrix multiply driver was originally written by Jason Riedy.
	* Most of the code (and comments) are his, but there are several
	* things I've hacked up. It seemed to work for him, so any errors
	* are probably mine.
	*
	* Ideally, you won't need to change this file. You may want to change
	* a few settings to speed debugging runs, but remember to change back
	* to the original settings during final testing.
	*
	* The output format: "Size: %u\tmflop/s: %g\n"
	* */


	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>

	#include <float.h>
	#include <math.h>

	#include <sys/types.h>
	#include <sys/resource.h>

	#include <unistd.h>
	#include <sys/time.h>

	#include "mm_malloc.h"
	//#include <mkl.h>

	/*
	* We try to run enough iterations to get reasonable timings. The matrices
	* are multiplied at least MIN_RUNS times. If that doesn't take MIN_CPU_SECS
	* seconds, then we double the number of iterations and try again.
	*
	* You may want to modify these to speed debugging...
	* */
	#define MIN_RUNS 1
	#define MIN_CPU_SECS 0.00000001

	#define _alpha 1.e0
	#define _beta 1.e0

	#define NN 512

	void dgemm( const int, const int, const int, const double, const double , const int, const double , const int, const double, double* , const int );


	void dgemm_ref( const int M, const int N, const int K, const double alpha, const double A, const int lda, const double B, const int ldb, const double beta, double* C, const int ldc)
	{
	int i, j, k;
	long int ops = 0;
	long int mem = 0;

	for (i = 0; i < M; ++i)
	for (j = 0; j < N; ++j)
	{
	double cij = betaC[jldc + i];
	mem += 8;
	for (k = 0; k < K; ++k)
	{
	cij += alphaA[i + klda]B[k + jldb];
	ops += 2;
	mem += 2*8;
	}
	C[j*ldc + i] = cij;
	mem += 8;
	}
	printf("ref ops = %ld\n", ops);
	printf("ref mem = %ld\n", mem);
	}

	double mysecond()
	{
	struct timeval tp;
	struct timezone tzp;
	int i;

	i = gettimeofday(&tp,&tzp);
	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
	}


	static void init(double *A, int M, int N, double v)
	{
	int ii, jj;
	for (ii = 0; ii < M; ++ii)
	for (jj= 0; jj < N; ++jj)
	A[jjM + ii] = v + jj + 0(jj*N + ii + 0);
	}

	static void init_t(double *A, int M, int N, double v)
	{
	int ii, jj;
	for (ii = 0; ii < M; ++ii)
	for (jj= 0; jj < N; ++jj)
	{
	if (ii == jj)
	A[jjM + ii] = v;//(jjM + ii);
	// printf("%d %d %f\n", ii, jj, A[jj*M + ii]);
	}
	}

	void
	matrix_init (double *A, const int M, const int N, double val)
	{
	int i;

	for (i = 0; i < M*N; ++i)
	{
	A[i] = drand48 ();
	//A[i] = M*val/N;
	}
	}

	void
	matrix_clear (double *C, const int M, const int N)
	{
	memset (C, 0, MNsizeof (double));
	}


	double
	time_dgemm (const int M, const int N, const unsigned K,
	const double alpha, const double *A, const int lda,
	const double *B, const int ldb,
	const double beta, double *C, const unsigned ldc)
	{
	double mflops, mflop_s;
	double secs = -1;

	int num_iterations = MIN_RUNS;
	int i;

	double* Ca = (double) _mm_malloc(Nldc*sizeof(double), 32);

	while (secs < MIN_CPU_SECS)
	{
	double cpu_time = 0;

	double last_clock = mysecond();
	for (i = 0; i < num_iterations; ++i)
	{
	memcpy(Ca, C, Nldcsizeof(double));
	cpu_time -= mysecond();
	dgemm(M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
	cpu_time += mysecond();
	}

	mflops = 2.0 * num_iterationsMN*K/1.0e6;
	secs = cpu_time;
	mflop_s = mflops/secs;

	num_iterations *= 2;
	}

	memcpy(C, Ca, Nldcsizeof(double));
	_mm_free(Ca);
	return mflop_s;
	}



	double time_dgemm_blas(const int M, const unsigned N, const int K,
	const double alpha, const double *A, const int lda,
	const double *B, const int ldb,
	const double beta, double *C, const int ldc)
	{

	double mflops, mflop_s;
	double secs = -1;

	int num_iterations = MIN_RUNS;
	int i;

	char transa = 'n';
	char transb = 'n';

	double* Ca = (double) _mm_malloc(Nldc*sizeof(double), 32);

	while (secs < MIN_CPU_SECS)
	{

	double cpu_time = 0;

	for (i = 0; i < num_iterations; ++i)
	{
	memcpy(Ca, C, Nldcsizeof(double));
	cpu_time -= mysecond();
	dgemm_(&transa, &transb, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, Ca, &ldc);
	//dgemm_ref (M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
	cpu_time += mysecond();
	}

	mflops = 2.0num_iterationsMNK/1.0e6;
	secs = cpu_time;
	mflop_s = mflops/secs;

	num_iterations *= 2;
	}
	memcpy(C, Ca, Nldcsizeof(double));
	_mm_free(Ca);

	return mflop_s;
	}

	int
	main (int argc, char *argv[])
	{
	int sz_i;
	double mflop_s, mflop_b;

	int M, N, K;

	if ( argc == 4 )
	{
	M = atoi(argv[1]);
	N = atoi(argv[2]);
	K = atoi(argv[3]);
	}
	else if (argc == 2)
	{
	M = atoi(argv[1]);
	N = M;
	K = M;
	}
	else
	{
	M = NN;
	N = NN;
	K = NN;
	}

	int lda = M;
	int ldb = K;
	int ldc = M;

	double* A = (double) _mm_malloc(MK*sizeof(double), 4096);
	double* B = (double) _mm_malloc(KN*sizeof(double), 4096);
	double* C = (double) _mm_malloc(MN*sizeof(double), 4096);
	double* Cb = (double) _mm_malloc(MN*sizeof(double), 4096);

	matrix_init(A, M, K, 1.);
	//init_t(A, M, K, 1.);
	matrix_init(B, K, N, 1.);
	//init_t(B, K, N, 1.);
	//matrix_clear(C);
	//init_t(C, N, M, 1.);
	matrix_init(C, M, N, 1.);
	memcpy(Cb, C, MNsizeof(double));
	//matrix_init(Cb, M, N, 0.);

	//const int M = test_sizes[sz_i];
	printf("Size: %u %u %u\t", M, N, K) ; fflush(stdout);

	mflop_s = time_dgemm (M, N, K, _alpha, A, lda, B, ldb, _beta, C , ldc);
	printf ("my Gflops/s: %g ", mflop_s/1000.); fflush(stdout);

	#if 1
	mflop_b = time_dgemm_blas(M, N, K, _alpha, A, lda, B, ldb, _beta, Cb, ldc);
	printf ("blas Gflops/s: %g\n", mflop_b/1000);

	#if 1
	int ii, jj;
	for (jj = 0; jj < N; ++jj)
	{
	for (ii = 0; ii < M; ++ii)
	{
	if (abs(C[jjldc + ii] - Cb[jjldc + ii]) != 0)
	printf("i = %d, j = %d, C = %f, should be %f\n", ii, jj, C[jjldc + ii], Cb[jjldc + ii]);
	}
	}
	#endif
	#endif
	printf("\n");
	_mm_free(A);
	_mm_free(B);
	_mm_free(C);
	_mm_free(Cb);


	return 0;
	}

dgemm.cNo OneTemporaryActions

File Metadata

dgemm.cView Options

Event Timeline

dgemm.c
No OneTemporary
Actions

dgemm.c
View Options