hw1: optimized blocked dgemm for matrix c

This commit is contained in:
Claudio Maggioni 2022-09-28 18:20:36 +02:00
parent 201e8c9927
commit df1cb3cf01
7 changed files with 1978777 additions and 348 deletions

View file

@ -88,7 +88,7 @@ and so on and so forth.
Therefore, for \texttt{csize = 128} and \texttt{stride = 1} the benchmark will
access all array indexes between 0 and 127 sequentially, and for \texttt{csize =
$2^{20}$} and \texttt{stride = $2^{10}$} the benchmark will access index 0, then
index $2^{10}-1$, and finally index $2^{20}-1$i.
index $2^{10}-1$, and finally index $2^{20}-1$.
\subsection{Analyzing Benchmark Results}

File diff suppressed because it is too large Load diff

View file

@ -1,4 +1,5 @@
#include <string.h>
/*
Please include compiler name below (you may also include any other modules you would like to be loaded)
@ -16,43 +17,49 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
/* Human-readable label identifying this dgemm implementation. */
const char* dgemm_desc = "Block-based dgemm.";

/* Edge length, in elements, of the square tiles the blocked kernel works on.
 * The scratch tile used by square_dgemm holds block_size * block_size doubles.
 * Only one definition may exist; the tuned value is kept. */
const int block_size = 18;
/* Returns the smaller of a and b (b when the two are equal).
 *
 * Declared static: in C99/C11 a plain `inline` definition does not provide an
 * external definition, so any call the compiler declines to inline (for
 * example when building without optimization) would fail to link. */
static inline int min(int a, int b) {
  return a < b ? a : b;
}
/* Accumulates the product of one tile of A with one tile of B into C_temp.
 *
 * A and B are indexed as n-by-n column-major matrices (element (r, c) lives
 * at r + c * n). The rows [r_min, r_max) of A, the columns [c_min, c_max) of
 * B and the shared index range [k_min, k_max) take part in the product.
 *
 * C_temp is a column-major scratch tile with leading dimension block_size;
 * its element (0, 0) corresponds to global position (r_min, c_min). Products
 * are added to whatever C_temp already holds, nothing is overwritten.
 *
 * Declared static because a plain `inline` definition provides no external
 * definition in C99/C11 and would fail to link whenever the call is not
 * inlined. */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, const double* A, const double* B, double* C_temp) {
  /* For each column j of the B tile */
  for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
    double* c_col = C_temp + jj * block_size;

    for (int k = k_min; k < k_max; ++k) {
      const double b_kj = B[k + j * n];
      const double* a_col = A + k * n;

      /* Innermost loop walks down a column of A and of C_temp, so both are
       * accessed with unit stride. Every C_temp element still receives its
       * contributions in ascending k order, exactly as with an i-j-k nest. */
      for (int i = r_min, ii = 0; i < r_max; ++i, ++ii) {
        c_col[ii] += a_col[i] * b_kj;
      }
    }
  }
}
/* Copies a scratch tile back into the matrix C, overwriting the target region.
 *
 * C is indexed as a column-major matrix with leading dimension n. C_temp is a
 * column-major tile with leading dimension block_size whose element (0, 0)
 * maps to C's element (r_min, c_min). Rows [r_min, r_max) of columns
 * [c_min, c_max) are written, one contiguous column segment per memcpy.
 *
 * Declared static because a plain `inline` definition provides no external
 * definition in C99/C11 and would fail to link whenever the call is not
 * inlined. */
static inline void store_c(double* C, const double* C_temp, int r_min, int r_max, int c_min, int c_max, int n) {
  const int rows = r_max - r_min;

  /* A non-positive row count must not reach memcpy: converted to size_t a
   * negative value would turn into an enormous byte count. */
  if (rows <= 0) {
    return;
  }

  for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
    memcpy(C + j * n + r_min, C_temp + jj * block_size, (size_t)rows * sizeof(double));
  }
}
/* This routine performs a dgemm operation
 *   C := C + A * B
 * where A, B, and C are n-by-n matrices stored in column-major format.
 * On exit, A and B maintain their input values.
 *
 * The matrices are processed in block_size-by-block_size tiles. For every
 * tile of C a scratch tile is seeded with the current contents of C, all
 * contributing A/B tile products are accumulated into it by naivemm, and the
 * finished tile is copied back with store_c. Tiles on the right and bottom
 * edges are clipped to the matrix bounds. */
void square_dgemm(int n, double* A, double* B, double* C) {
  double C_temp[block_size * block_size];

  /* For each block-row of A */
  for (int i = 0; i < n; i += block_size) {
    int i_next = min(i + block_size, n);
    size_t rows = (size_t)(i_next - i);

    /* For each block-column of B */
    for (int j = 0; j < n; j += block_size) {
      int j_next = min(j + block_size, n);

      /* Seed the scratch tile from C rather than zeroing it: store_c
       * overwrites C, so starting from zero would yield C := A * B and
       * silently discard the caller's existing C values. */
      for (int jj = 0; jj < j_next - j; ++jj) {
        memcpy(C_temp + jj * block_size, C + (size_t)(j + jj) * n + i, rows * sizeof(double));
      }

      /* Accumulate every A(i, k) * B(k, j) tile product into the scratch tile */
      for (int k = 0; k < n; k += block_size) {
        int k_next = min(k + block_size, n);
        naivemm(i, i_next, k, k_next, j, j_next, n, A, B, C_temp);
      }

      store_c(C, C_temp, i, i_next, j, j_next, n);
    }
  }
}

View file

@ -1,7 +1,7 @@
%!PS-Adobe-2.0
%%Title: timing.ps
%%Creator: gnuplot 5.2 patchlevel 8
%%CreationDate: Wed Sep 28 13:13:45 2022
%%CreationDate: Wed Sep 28 17:53:39 2022
%%DocumentFonts: (atend)
%%BoundingBox: 50 50 554 770
%%Orientation: Landscape
@ -483,7 +483,7 @@ SDict begin [
/Creator (gnuplot 5.2 patchlevel 8)
% /Producer (gnuplot)
% /Keywords ()
/CreationDate (Wed Sep 28 13:13:45 2022)
/CreationDate (Wed Sep 28 17:53:39 2022)
/DOCINFO pdfmark
end
} ifelse
@ -545,34 +545,18 @@ LCb setrgbcolor
-63 0 V
stroke
854 448 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 100)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 1000)]
] -46.7 MRshow
/Helvetica findfont 140 scalefont setfont
/vshift -46 def
1.000 UL
LTb
LCb setrgbcolor
938 867 M
938 1076 M
31 0 V
5978 0 R
-31 0 V
938 1111 M
31 0 V
5978 0 R
-31 0 V
938 1285 M
31 0 V
5978 0 R
-31 0 V
938 1420 M
31 0 V
5978 0 R
-31 0 V
938 1530 M
31 0 V
5978 0 R
-31 0 V
938 1623 M
938 1443 M
31 0 V
5978 0 R
-31 0 V
@ -580,7 +564,23 @@ LCb setrgbcolor
31 0 V
5978 0 R
-31 0 V
938 1775 M
938 1906 M
31 0 V
5978 0 R
-31 0 V
938 2071 M
31 0 V
5978 0 R
-31 0 V
938 2210 M
31 0 V
5978 0 R
-31 0 V
938 2331 M
31 0 V
5978 0 R
-31 0 V
938 2438 M
31 0 V
5978 0 R
-31 0 V
@ -588,107 +588,53 @@ stroke
0.500 UL
LTa
LCa setrgbcolor
938 1838 M
938 2534 M
6009 0 V
stroke
1.000 UL
LTb
LCb setrgbcolor
938 1838 M
938 2534 M
63 0 V
5946 0 R
-63 0 V
stroke
854 1838 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 1000)]
] -46.7 MRshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
LTb
LCb setrgbcolor
938 2257 M
31 0 V
5978 0 R
-31 0 V
938 2502 M
31 0 V
5978 0 R
-31 0 V
938 2675 M
31 0 V
5978 0 R
-31 0 V
938 2810 M
31 0 V
5978 0 R
-31 0 V
938 2920 M
31 0 V
5978 0 R
-31 0 V
938 3013 M
31 0 V
5978 0 R
-31 0 V
938 3094 M
31 0 V
5978 0 R
-31 0 V
938 3165 M
31 0 V
5978 0 R
-31 0 V
stroke
0.500 UL
LTa
LCa setrgbcolor
938 3229 M
6009 0 V
stroke
1.000 UL
LTb
LCb setrgbcolor
938 3229 M
63 0 V
5946 0 R
-63 0 V
stroke
854 3229 M
854 2534 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 10000)]
] -46.7 MRshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
LTb
LCb setrgbcolor
938 3647 M
938 3161 M
31 0 V
5978 0 R
-31 0 V
938 3892 M
938 3529 M
31 0 V
5978 0 R
-31 0 V
938 4066 M
938 3789 M
31 0 V
5978 0 R
-31 0 V
938 4200 M
938 3991 M
31 0 V
5978 0 R
-31 0 V
938 4311 M
938 4156 M
31 0 V
5978 0 R
-31 0 V
938 4404 M
938 4296 M
31 0 V
5978 0 R
-31 0 V
938 4484 M
938 4417 M
31 0 V
5978 0 R
-31 0 V
938 4555 M
938 4524 M
31 0 V
5978 0 R
-31 0 V
@ -729,7 +675,7 @@ LCb setrgbcolor
0 -63 V
stroke
938 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 0)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 30.6)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -750,7 +696,7 @@ LCb setrgbcolor
0 -63 V
stroke
1689 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 100)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 30.7)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -771,7 +717,7 @@ LCb setrgbcolor
0 -63 V
stroke
2440 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 200)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 30.8)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -792,7 +738,7 @@ LCb setrgbcolor
0 -63 V
stroke
3191 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 300)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 30.9)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -813,7 +759,7 @@ LCb setrgbcolor
0 -63 V
stroke
3943 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 400)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 31)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -834,7 +780,7 @@ LCb setrgbcolor
0 -63 V
stroke
4694 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 500)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 31.1)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -857,7 +803,7 @@ LCb setrgbcolor
0 -63 V
stroke
5445 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 600)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 31.2)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -880,7 +826,7 @@ LCb setrgbcolor
0 -63 V
stroke
6196 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 700)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 31.3)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -901,7 +847,7 @@ LCb setrgbcolor
0 -63 V
stroke
6947 308 M
[ [(Helvetica) 140.0 0.0 true true 0 ( 800)]
[ [(Helvetica) 140.0 0.0 true true 0 ( 31.4)]
] -46.7 MCshow
/Helvetica findfont 140 scalefont setfont
1.000 UL
@ -960,58 +906,8 @@ LTb
0.58 0.00 0.83 C
6380 4486 M
399 0 V
1171 2295 M
7 69 V
481 -156 V
8 46 V
225 -35 V
7 -150 V
8 86 V
466 20 V
7 -109 V
278 98 V
195 2 V
8 -631 V
7 636 V
466 -5 V
8 -77 V
7 77 V
721 -1 V
466 1 V
7 -46 V
233 45 V
8 -654 V
954 640 V
7 -487 V
954 308 V
8 -337 V
7 488 V
1171 2295 Pls
1178 2364 Pls
1659 2208 Pls
1667 2254 Pls
1892 2219 Pls
1899 2069 Pls
1907 2155 Pls
2373 2175 Pls
2380 2066 Pls
2658 2164 Pls
2853 2166 Pls
2861 1535 Pls
2868 2171 Pls
3334 2166 Pls
3342 2089 Pls
3349 2166 Pls
4070 2165 Pls
4536 2166 Pls
4543 2120 Pls
4776 2165 Pls
4784 1511 Pls
5738 2151 Pls
5745 1664 Pls
6699 1972 Pls
6707 1635 Pls
6714 2123 Pls
3942 1253 M
3942 1253 Pls
6579 4486 Pls
% End plot #1
% Begin plot #2
@ -1030,58 +926,8 @@ LTb
0.00 0.62 0.45 C
6380 4346 M
399 0 V
1171 2175 M
7 -15 V
481 8 V
8 0 V
225 -3 V
7 -82 V
8 64 V
466 -5 V
7 -54 V
278 40 V
195 -6 V
8 -446 V
7 445 V
466 8 V
8 -121 V
7 108 V
721 14 V
466 -4 V
7 -36 V
233 16 V
8 -553 V
954 567 V
7 -549 V
954 539 V
8 -424 V
7 409 V
1171 2175 Crs
1178 2160 Crs
1659 2168 Crs
1667 2168 Crs
1892 2165 Crs
1899 2083 Crs
1907 2147 Crs
2373 2142 Crs
2380 2088 Crs
2658 2128 Crs
2853 2122 Crs
2861 1676 Crs
2868 2121 Crs
3334 2129 Crs
3342 2008 Crs
3349 2116 Crs
4070 2130 Crs
4536 2126 Crs
4543 2090 Crs
4776 2106 Crs
4784 1553 Crs
5738 2120 Crs
5745 1571 Crs
6699 2110 Crs
6707 1686 Crs
6714 2095 Crs
3942 1205 M
3942 1205 Crs
6579 4346 Crs
% End plot #2
% Begin plot #3
@ -1100,58 +946,8 @@ LTb
0.34 0.71 0.91 C
6380 4206 M
399 0 V
1171 3733 M
7 79 V
481 134 V
8 -28 V
225 -40 V
7 64 V
8 -28 V
466 9 V
7 66 V
278 -17 V
195 -28 V
8 33 V
7 -30 V
466 2 V
8 59 V
7 -32 V
721 -6 V
466 17 V
7 9 V
233 -33 V
8 38 V
954 -23 V
7 12 V
954 8 V
8 23 V
7 -52 V
1171 3733 Star
1178 3812 Star
1659 3946 Star
1667 3918 Star
1892 3878 Star
1899 3942 Star
1907 3914 Star
2373 3923 Star
2380 3989 Star
2658 3972 Star
2853 3944 Star
2861 3977 Star
2868 3947 Star
3334 3949 Star
3342 4008 Star
3349 3976 Star
4070 3970 Star
4536 3987 Star
4543 3996 Star
4776 3963 Star
4784 4001 Star
5738 3978 Star
5745 3990 Star
6699 3998 Star
6707 4021 Star
6714 3969 Star
3942 3120 M
3942 3120 Star
6579 4206 Star
% End plot #3
2.000 UL

View file

@ -1,29 +1,4 @@
#Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 2131.35 Percentage: 5.79
Size: 32 Mflop/s: 2387.28 Percentage: 6.49
Size: 96 Mflop/s: 1844.52 Percentage: 5.01
Size: 97 Mflop/s: 1991.74 Percentage: 5.41
Size: 127 Mflop/s: 1878.09 Percentage: 5.10
Size: 128 Mflop/s: 1466.11 Percentage: 3.98
Size: 129 Mflop/s: 1688.41 Percentage: 4.59
Size: 191 Mflop/s: 1747.28 Percentage: 4.75
Size: 192 Mflop/s: 1458.67 Percentage: 3.96
Size: 229 Mflop/s: 1714.53 Percentage: 4.66
Size: 255 Mflop/s: 1719.28 Percentage: 4.67
Size: 256 Mflop/s: 604.682 Percentage: 1.64
Size: 257 Mflop/s: 1733.51 Percentage: 4.71
Size: 319 Mflop/s: 1720.89 Percentage: 4.68
Size: 320 Mflop/s: 1514.24 Percentage: 4.11
Size: 321 Mflop/s: 1721.2 Percentage: 4.68
Size: 417 Mflop/s: 1718.17 Percentage: 4.67
Size: 479 Mflop/s: 1719.18 Percentage: 4.67
Size: 480 Mflop/s: 1594.88 Percentage: 4.33
Size: 511 Mflop/s: 1716.8 Percentage: 4.67
Size: 512 Mflop/s: 581.233 Percentage: 1.58
Size: 639 Mflop/s: 1678.33 Percentage: 4.56
Size: 640 Mflop/s: 749.008 Percentage: 2.04
Size: 767 Mflop/s: 1247.59 Percentage: 3.39
Size: 768 Mflop/s: 714.52 Percentage: 1.94
Size: 769 Mflop/s: 1603.09 Percentage: 4.36
#Average percentage of Peak = 4.24797
Size: 31 Mflop/s: 2431.2 Percentage: 6.61
#Average percentage of Peak = 6.60652

View file

@ -1,29 +1,4 @@
#Description: Reference dgemm.
Size: 31 Mflop/s: 23035.3 Percentage: 62.60
Size: 32 Mflop/s: 26290.9 Percentage: 71.44
Size: 96 Mflop/s: 32829.1 Percentage: 89.21
Size: 97 Mflop/s: 31312.6 Percentage: 85.09
Size: 127 Mflop/s: 29329 Percentage: 79.70
Size: 128 Mflop/s: 32578.6 Percentage: 88.53
Size: 129 Mflop/s: 31113.1 Percentage: 84.55
Size: 191 Mflop/s: 31590.5 Percentage: 85.84
Size: 192 Mflop/s: 35219.4 Percentage: 95.70
Size: 229 Mflop/s: 34236 Percentage: 93.03
Size: 255 Mflop/s: 32692.5 Percentage: 88.84
Size: 256 Mflop/s: 34510 Percentage: 93.78
Size: 257 Mflop/s: 32844.7 Percentage: 89.25
Size: 319 Mflop/s: 32950.9 Percentage: 89.54
Size: 320 Mflop/s: 36332.4 Percentage: 98.73
Size: 321 Mflop/s: 34460.3 Percentage: 93.64
Size: 417 Mflop/s: 34136 Percentage: 92.76
Size: 479 Mflop/s: 35101.8 Percentage: 95.39
Size: 480 Mflop/s: 35608.8 Percentage: 96.76
Size: 511 Mflop/s: 33768.6 Percentage: 91.76
Size: 512 Mflop/s: 35947 Percentage: 97.68
Size: 639 Mflop/s: 34572.5 Percentage: 93.95
Size: 640 Mflop/s: 35268.1 Percentage: 95.84
Size: 767 Mflop/s: 35731.4 Percentage: 97.10
Size: 768 Mflop/s: 37114.6 Percentage:100.85
Size: 769 Mflop/s: 34093.6 Percentage: 92.65
#Average percentage of Peak = 90.1618
Size: 31 Mflop/s: 19099.4 Percentage: 51.90
#Average percentage of Peak = 51.9005

View file

@ -1,29 +1,3 @@
#Description: Naive, three-loop dgemm.
#Description: Block-based dgemm.
Size: 31 Mflop/s: 1065.56 Percentage: 2.90
Size: 32 Mflop/s: 1703.76 Percentage: 4.63
Size: 96 Mflop/s: 1730.73 Percentage: 4.70
Size: 97 Mflop/s: 1728.48 Percentage: 4.70
Size: 127 Mflop/s: 1718.52 Percentage: 4.67
Size: 128 Mflop/s: 1533.64 Percentage: 4.17
Size: 129 Mflop/s: 1724.17 Percentage: 4.69
Size: 191 Mflop/s: 1636.9 Percentage: 4.45
Size: 192 Mflop/s: 1534.75 Percentage: 4.17
Size: 229 Mflop/s: 1604.48 Percentage: 4.36
Size: 255 Mflop/s: 1462.11 Percentage: 3.97
Size: 256 Mflop/s: 730.562 Percentage: 1.99
Size: 257 Mflop/s: 1483.12 Percentage: 4.03
Size: 319 Mflop/s: 1409.3 Percentage: 3.83
Size: 320 Mflop/s: 1303.95 Percentage: 3.54
Size: 321 Mflop/s: 1621.34 Percentage: 4.41
Size: 417 Mflop/s: 1496.69 Percentage: 4.07
Size: 479 Mflop/s: 1518.7 Percentage: 4.13
Size: 480 Mflop/s: 1429.18 Percentage: 3.88
Size: 511 Mflop/s: 1371.7 Percentage: 3.73
Size: 512 Mflop/s: 602.424 Percentage: 1.64
Size: 639 Mflop/s: 1339.03 Percentage: 3.64
Size: 640 Mflop/s: 913.949 Percentage: 2.48
Size: 767 Mflop/s: 1566.19 Percentage: 4.26
Size: 768 Mflop/s: 757.52 Percentage: 2.06
Size: 769 Mflop/s: 1559.49 Percentage: 4.24
#Average percentage of Peak = 3.81963
Size: 31 Mflop/s: 2306.44 Percentage: 6.27