Page MenuHomec4science

dgemm-unroll.c
No OneTemporary

File Metadata

Created
Wed, Aug 14, 20:23

dgemm-unroll.c

#include <sys/time.h>
#include <stdlib.h>
#include "mm_malloc.h"
#if !defined(BLOCK_SIZE)
#ifndef M_BLOCK_SIZE
#define M_BLOCK_SIZE 128
#endif
#ifndef N_BLOCK_SIZE
#define N_BLOCK_SIZE 8
#endif
#ifndef K_BLOCK_SIZE
#define K_BLOCK_SIZE 128
#endif
#else
#define N_BLOCK_SIZE BLOCK_SIZE
#define M_BLOCK_SIZE BLOCK_SIZE
#define K_BLOCK_SIZE BLOCK_SIZE
#endif
#define PAGESIZE 4096;
#define NUMPERPAGE 512 // # of elements to fit a page
#define PREFETCH __builtin_prefetch
#define min(a,b) (((a)<(b))?(a):(b))
//double Ab[M_BLOCK_SIZE*K_BLOCK_SIZE];
//double Bb[K_BLOCK_SIZE*N_BLOCK_SIZE];
//double Cb[M_BLOCK_SIZE*N_BLOCK_SIZE];
//#define A_BLOCK
//#define B_BLOCK
//#define C_BLOCK
double myseconds()
{
struct timeval tp;
struct timezone tzp;
int i;
i = gettimeofday(&tp,&tzp);
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
void dgemm( const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double* C, const int ldc)
{
int ib, jb, kb;
int i, j, k;
double* Ab = (double*) _mm_malloc(M_BLOCK_SIZE*K_BLOCK_SIZE*sizeof(double),32);
double* Bb = (double*) _mm_malloc(K_BLOCK_SIZE*N_BLOCK_SIZE*sizeof(double),32);
//
double b00, b10, b20, b30;
double b01, b11, b21, b31;
//
double copytime = 0.;
double computetime = 0.;
//
for( kb = 0; kb < K; kb += K_BLOCK_SIZE ){ // GEMM
int Kb = min( K_BLOCK_SIZE, K - kb );
//
for( jb = 0; jb < N; jb += N_BLOCK_SIZE ){ // GEBP
int Nb = min( N_BLOCK_SIZE, N - jb );
//
copytime -= myseconds();
for (j = 0; j < Nb; j = j + 1)
for (k = 0; k < Kb; k = k + 1){
Bb[j*K_BLOCK_SIZE + k] = B[(j + jb)*ldb + k + kb];
}
copytime += myseconds();
//
for( ib = 0; ib < M; ib += M_BLOCK_SIZE ){ // GEPP
int Mb = min( M_BLOCK_SIZE, M - ib );
copytime -= myseconds();
for(k = 0; k < Kb; ++k)
for (i = 0; i < Mb; i = i + 1)
{
Ab[(i + 0)*K_BLOCK_SIZE + k] = A[(k + kb)*lda + i + ib];
}
copytime += myseconds();
//
for (i = 0; i < Mb - Mb%4; i = i + 4)
{
for (j = 0; j < Nb - Nb%2; j = j + 2)
{
double c00 = 0.;
double c01 = 0.;
double c02 = 0.;
double c03 = 0.;
//
double c10 = 0.;
double c11 = 0.;
double c12 = 0.;
double c13 = 0.;
//
double c20 = 0.;
double c21 = 0.;
double c22 = 0.;
double c23 = 0.;
//
double c30 = 0.;
double c31 = 0.;
double c32 = 0.;
double c33 = 0.;
double* pA = &Ab[(i + 0)*K_BLOCK_SIZE];
double* pB = &Bb[(j + 0)*K_BLOCK_SIZE];
//PREFETCH((void*) &Bb[(j + 4)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Bb[(j + 5)*K_BLOCK_SIZE]);
//
//PREFETCH((void*) &Ab[(i + 4)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 5)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 6)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 7)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 8)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 9)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 10)*K_BLOCK_SIZE]);
//PREFETCH((void*) &Ab[(i + 11)*K_BLOCK_SIZE]);
//computetime -= myseconds();
b00 = Bb[(j + 0)*K_BLOCK_SIZE + 0];
b10 = Bb[(j + 1)*K_BLOCK_SIZE + 0];
b20 = Bb[(j + 2)*K_BLOCK_SIZE + 0];
b30 = Bb[(j + 3)*K_BLOCK_SIZE + 0];
PREFETCH((void*) &Ab[(i + 4)*K_BLOCK_SIZE + 000]);
PREFETCH((void*) &Ab[(i + 5)*K_BLOCK_SIZE + 000]);
PREFETCH((void*) &Ab[(i + 6)*K_BLOCK_SIZE + 000]);
PREFETCH((void*) &Ab[(i + 7)*K_BLOCK_SIZE + 000]);
for (k = 0; k < Kb - 1; k = k + 1)
{
//
//
//
c00 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b00;
c01 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b00;
c02 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b00;
c03 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b00;
b00 = Bb[(j + 0)*K_BLOCK_SIZE + k + 1];
//
c10 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b10;
c11 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b10;
c12 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b10;
c13 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b10;
b10 = Bb[(j + 1)*K_BLOCK_SIZE + k + 1];
/*
c20 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b20;
c21 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b20;
c22 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b20;
c23 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b20;
b20 = Bb[(j + 2)*K_BLOCK_SIZE + k + 1];
//
c30 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b30;
c31 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b30;
c32 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b30;
c33 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b30;
b30 = Bb[(j + 3)*K_BLOCK_SIZE + k + 1];
*/
}
c00 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b00;
c01 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b00;
c02 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b00;
c03 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b00;
//
c10 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b10;
c11 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b10;
c12 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b10;
c13 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b10;
/*
c20 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b20;
c21 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b20;
c22 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b20;
c23 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b20;
//
c30 += Ab[(i + 0)*K_BLOCK_SIZE + k]*b30;
c31 += Ab[(i + 1)*K_BLOCK_SIZE + k]*b30;
c32 += Ab[(i + 2)*K_BLOCK_SIZE + k]*b30;
c33 += Ab[(i + 3)*K_BLOCK_SIZE + k]*b30;
*/
//computetime += myseconds();
//
C[(j + jb + 0)*ldc + i + ib + 0] += c00;
C[(j + jb + 0)*ldc + i + ib + 1] += c01;
C[(j + jb + 0)*ldc + i + ib + 2] += c02;
C[(j + jb + 0)*ldc + i + ib + 3] += c03;
//
C[(j + jb + 1)*ldc + i + ib + 0] += c10;
C[(j + jb + 1)*ldc + i + ib + 1] += c11;
C[(j + jb + 1)*ldc + i + ib + 2] += c12;
C[(j + jb + 1)*ldc + i + ib + 3] += c13;
/*
C[(j + jb + 2)*ldc + i + ib + 0] += c20;
C[(j + jb + 2)*ldc + i + ib + 1] += c21;
C[(j + jb + 2)*ldc + i + ib + 2] += c22;
C[(j + jb + 2)*ldc + i + ib + 3] += c23;
//
C[(j + jb + 3)*ldc + i + ib + 0] += c30;
C[(j + jb + 3)*ldc + i + ib + 1] += c31;
C[(j + jb + 3)*ldc + i + ib + 2] += c32;
C[(j + jb + 3)*ldc + i + ib + 3] += c33;
*/
}
}
}
}
}
printf("copy time = %f, compute time = %f, %f GFlops\n", copytime, computetime, 2.*M*N*K/computetime/1e9);
_mm_free(Ab);
_mm_free(Bb);
}

Event Timeline