Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F77498809
dgemm.c
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Aug 14, 20:20
Size
6 KB
Mime Type
text/x-c
Expires
Fri, Aug 16, 20:20 (2 d)
Engine
blob
Format
Raw Data
Handle
19877365
Attached To
R31 arm64-hpc
dgemm.c
View Options
/*
* This matrix multiply driver was originally written by Jason Riedy.
* Most of the code (and comments) are his, but there are several
* things I've hacked up. It seemed to work for him, so any errors
* are probably mine.
*
* Ideally, you won't need to change this file. You may want to change
* a few settings to speed debugging runs, but remember to change back
* to the original settings during final testing.
*
* The output format: "Size: %u\tmflop/s: %g\n"
* */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <sys/types.h>
#include <sys/resource.h>
#include <unistd.h>
#include <sys/time.h>
#include "mm_malloc.h"
//#include <mkl.h>
/*
* We try to run enough iterations to get reasonable timings. The matrices
* are multiplied at least MIN_RUNS times. If that doesn't take MIN_CPU_SECS
* seconds, then we double the number of iterations and try again.
*
* You may want to modify these to speed debugging...
* */
#define MIN_RUNS 1
#define MIN_CPU_SECS 0.00000001
#define _alpha 1.e0
#define _beta 1.e0
#define NN 512
void
dgemm
(
const
int
,
const
int
,
const
int
,
const
double
,
const
double
*
,
const
int
,
const
double
*
,
const
int
,
const
double
,
double
*
,
const
int
);
void
dgemm_ref
(
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
int
lda
,
const
double
*
B
,
const
int
ldb
,
const
double
beta
,
double
*
C
,
const
int
ldc
)
{
int
i
,
j
,
k
;
long
int
ops
=
0
;
long
int
mem
=
0
;
for
(
i
=
0
;
i
<
M
;
++
i
)
for
(
j
=
0
;
j
<
N
;
++
j
)
{
double
cij
=
beta
*
C
[
j
*
ldc
+
i
];
mem
+=
8
;
for
(
k
=
0
;
k
<
K
;
++
k
)
{
cij
+=
alpha
*
A
[
i
+
k
*
lda
]
*
B
[
k
+
j
*
ldb
];
ops
+=
2
;
mem
+=
2
*
8
;
}
C
[
j
*
ldc
+
i
]
=
cij
;
mem
+=
8
;
}
printf
(
"ref ops = %ld
\n
"
,
ops
);
printf
(
"ref mem = %ld
\n
"
,
mem
);
}
double
mysecond
()
{
struct
timeval
tp
;
struct
timezone
tzp
;
int
i
;
i
=
gettimeofday
(
&
tp
,
&
tzp
);
return
(
(
double
)
tp
.
tv_sec
+
(
double
)
tp
.
tv_usec
*
1.e-6
);
}
static
void
init
(
double
*
A
,
int
M
,
int
N
,
double
v
)
{
int
ii
,
jj
;
for
(
ii
=
0
;
ii
<
M
;
++
ii
)
for
(
jj
=
0
;
jj
<
N
;
++
jj
)
A
[
jj
*
M
+
ii
]
=
v
+
jj
+
0
*
(
jj
*
N
+
ii
+
0
);
}
static
void
init_t
(
double
*
A
,
int
M
,
int
N
,
double
v
)
{
int
ii
,
jj
;
for
(
ii
=
0
;
ii
<
M
;
++
ii
)
for
(
jj
=
0
;
jj
<
N
;
++
jj
)
{
if
(
ii
==
jj
)
A
[
jj
*
M
+
ii
]
=
v
;
//(jj*M + ii);
// printf("%d %d %f\n", ii, jj, A[jj*M + ii]);
}
}
void
matrix_init
(
double
*
A
,
const
int
M
,
const
int
N
,
double
val
)
{
int
i
;
for
(
i
=
0
;
i
<
M
*
N
;
++
i
)
{
A
[
i
]
=
drand48
();
//A[i] = M*val/N;
}
}
void
matrix_clear
(
double
*
C
,
const
int
M
,
const
int
N
)
{
memset
(
C
,
0
,
M
*
N
*
sizeof
(
double
));
}
double
time_dgemm
(
const
int
M
,
const
int
N
,
const
unsigned
K
,
const
double
alpha
,
const
double
*
A
,
const
int
lda
,
const
double
*
B
,
const
int
ldb
,
const
double
beta
,
double
*
C
,
const
unsigned
ldc
)
{
double
mflops
,
mflop_s
;
double
secs
=
-
1
;
int
num_iterations
=
MIN_RUNS
;
int
i
;
double
*
Ca
=
(
double
*
)
_mm_malloc
(
N
*
ldc
*
sizeof
(
double
),
32
);
while
(
secs
<
MIN_CPU_SECS
)
{
double
cpu_time
=
0
;
double
last_clock
=
mysecond
();
for
(
i
=
0
;
i
<
num_iterations
;
++
i
)
{
memcpy
(
Ca
,
C
,
N
*
ldc
*
sizeof
(
double
));
cpu_time
-=
mysecond
();
dgemm
(
M
,
N
,
K
,
alpha
,
A
,
lda
,
B
,
ldb
,
beta
,
Ca
,
ldc
);
cpu_time
+=
mysecond
();
}
mflops
=
2.0
*
num_iterations
*
M
*
N
*
K
/
1.0e6
;
secs
=
cpu_time
;
mflop_s
=
mflops
/
secs
;
num_iterations
*=
2
;
}
memcpy
(
C
,
Ca
,
N
*
ldc
*
sizeof
(
double
));
_mm_free
(
Ca
);
return
mflop_s
;
}
double
time_dgemm_blas
(
const
int
M
,
const
unsigned
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
int
lda
,
const
double
*
B
,
const
int
ldb
,
const
double
beta
,
double
*
C
,
const
int
ldc
)
{
double
mflops
,
mflop_s
;
double
secs
=
-
1
;
int
num_iterations
=
MIN_RUNS
;
int
i
;
char
transa
=
'n'
;
char
transb
=
'n'
;
double
*
Ca
=
(
double
*
)
_mm_malloc
(
N
*
ldc
*
sizeof
(
double
),
32
);
while
(
secs
<
MIN_CPU_SECS
)
{
double
cpu_time
=
0
;
for
(
i
=
0
;
i
<
num_iterations
;
++
i
)
{
memcpy
(
Ca
,
C
,
N
*
ldc
*
sizeof
(
double
));
cpu_time
-=
mysecond
();
dgemm_
(
&
transa
,
&
transb
,
&
M
,
&
N
,
&
K
,
&
alpha
,
A
,
&
lda
,
B
,
&
ldb
,
&
beta
,
Ca
,
&
ldc
);
//dgemm_ref (M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
cpu_time
+=
mysecond
();
}
mflops
=
2.0
*
num_iterations
*
M
*
N
*
K
/
1.0e6
;
secs
=
cpu_time
;
mflop_s
=
mflops
/
secs
;
num_iterations
*=
2
;
}
memcpy
(
C
,
Ca
,
N
*
ldc
*
sizeof
(
double
));
_mm_free
(
Ca
);
return
mflop_s
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
int
sz_i
;
double
mflop_s
,
mflop_b
;
int
M
,
N
,
K
;
if
(
argc
==
4
)
{
M
=
atoi
(
argv
[
1
]);
N
=
atoi
(
argv
[
2
]);
K
=
atoi
(
argv
[
3
]);
}
else
if
(
argc
==
2
)
{
M
=
atoi
(
argv
[
1
]);
N
=
M
;
K
=
M
;
}
else
{
M
=
NN
;
N
=
NN
;
K
=
NN
;
}
int
lda
=
M
;
int
ldb
=
K
;
int
ldc
=
M
;
double
*
A
=
(
double
*
)
_mm_malloc
(
M
*
K
*
sizeof
(
double
),
4096
);
double
*
B
=
(
double
*
)
_mm_malloc
(
K
*
N
*
sizeof
(
double
),
4096
);
double
*
C
=
(
double
*
)
_mm_malloc
(
M
*
N
*
sizeof
(
double
),
4096
);
double
*
Cb
=
(
double
*
)
_mm_malloc
(
M
*
N
*
sizeof
(
double
),
4096
);
matrix_init
(
A
,
M
,
K
,
1.
);
//init_t(A, M, K, 1.);
matrix_init
(
B
,
K
,
N
,
1.
);
//init_t(B, K, N, 1.);
//matrix_clear(C);
//init_t(C, N, M, 1.);
matrix_init
(
C
,
M
,
N
,
1.
);
memcpy
(
Cb
,
C
,
M
*
N
*
sizeof
(
double
));
//matrix_init(Cb, M, N, 0.);
//const int M = test_sizes[sz_i];
printf
(
"Size: %u %u %u
\t
"
,
M
,
N
,
K
)
;
fflush
(
stdout
);
mflop_s
=
time_dgemm
(
M
,
N
,
K
,
_alpha
,
A
,
lda
,
B
,
ldb
,
_beta
,
C
,
ldc
);
printf
(
"my Gflops/s: %g "
,
mflop_s
/
1000.
);
fflush
(
stdout
);
#if 1
mflop_b
=
time_dgemm_blas
(
M
,
N
,
K
,
_alpha
,
A
,
lda
,
B
,
ldb
,
_beta
,
Cb
,
ldc
);
printf
(
"blas Gflops/s: %g
\n
"
,
mflop_b
/
1000
);
#if 1
int
ii
,
jj
;
for
(
jj
=
0
;
jj
<
N
;
++
jj
)
{
for
(
ii
=
0
;
ii
<
M
;
++
ii
)
{
if
(
abs
(
C
[
jj
*
ldc
+
ii
]
-
Cb
[
jj
*
ldc
+
ii
])
!=
0
)
printf
(
"i = %d, j = %d, C = %f, should be %f
\n
"
,
ii
,
jj
,
C
[
jj
*
ldc
+
ii
],
Cb
[
jj
*
ldc
+
ii
]);
}
}
#endif
#endif
printf
(
"
\n
"
);
_mm_free
(
A
);
_mm_free
(
B
);
_mm_free
(
C
);
_mm_free
(
Cb
);
return
0
;
}
Event Timeline
Log In to Comment