hw1: compiler flags

This commit is contained in:
Claudio Maggioni 2022-10-12 11:02:19 +02:00
parent abbd0275ec
commit 980eb5f0b9
6 changed files with 259 additions and 263 deletions

View file

@ -3,8 +3,8 @@
#
CC = gcc
OPT = -O2
CFLAGS = -Wall -std=gnu99 $(OPT)
OPT = -O3
CFLAGS = -Wall -std=gnu99 -march=haswell -ffast-math $(OPT)
LDFLAGS = -Wall
# librt is needed for clock_gettime
LDLIBS = -lrt -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm -ldl -m64 -I${MKLROOT}/include

View file

@ -15,29 +15,11 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
*/
/* Smaller of a and b. This is a function-like macro: the selected argument is
 * evaluated a second time, so do not pass expressions with side effects. */
#define MIN(a,b) (((a)<(b))?(a):(b))
/* Human-readable name of this implementation (presumably reported by the
 * benchmark driver -- confirm against the caller). */
const char* dgemm_desc = "Block-based dgemm.";
/* Edge length, in elements, of the square tiles the matrices are split into;
 * also used as the column stride when indexing the C_temp scratch tile. */
const int block_size = 26;
/*
 * Return the smaller of two ints (b when the two are equal).
 *
 * Declared `static inline` rather than plain `inline`: in C99 and later a
 * plain `inline` definition does not emit an external symbol, so any call the
 * compiler chooses not to inline (for example when building without
 * optimization) would fail to link with an undefined reference to `min`.
 */
static inline int min(int a, int b) {
    return a < b ? a : b;
}
/*
 * Accumulate the product of one tile of A with one tile of B into C_temp.
 *
 * For every row i in [r_min, r_max) and every column j in [c_min, c_max),
 * adds the sum over k in [k_min, k_max) of
 *     A_row[i * n + k] * B[k + j * n]
 * to C_temp[(i - r_min) + (j - c_min) * block_size].
 *
 * r_min, r_max  half-open range of rows handled by this call
 * k_min, k_max  half-open range of the summation index
 * c_min, c_max  half-open range of columns handled by this call
 * n             stride used when indexing A_row and B
 * A_row         input, only read; element (i, k) is at A_row[i * n + k]
 * B             input, only read; element (k, j) is at B[k + j * n]
 * C_temp        scratch tile updated in place (values are added to, not
 *               overwritten), indexed with block_size as its column stride
 *
 * `static inline` instead of plain `inline`: a plain C99 inline definition
 * provides no external symbol, so a call that is not inlined would not link.
 */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, const double* A_row, const double* B, double* C_temp) {
    for (int i = r_min, ii = 0; i < r_max; ++i, ++ii) {
        /* Start of row i of A; invariant across the j and k loops. */
        const double* a_line = A_row + i * n;
        for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
            /* Start of column j of B and the destination cell in the tile. */
            const double* b_col = B + j * n;
            double* c_cell = C_temp + ii + jj * block_size;
            for (int k = k_min; k < k_max; k++) {
                *c_cell += a_line[k] * b_col[k];
            }
        }
    }
}
/*
 * Copy the scratch tile C_temp into its place inside C.
 *
 * For each column j in [c_min, c_max), copies (r_max - r_min) doubles from
 * C_temp + (j - c_min) * block_size to C + j * n + r_min, overwriting what
 * was stored there.
 *
 * C             destination; column j starts at C + j * n
 * C_temp        source tile, only read; block_size is its column stride
 * r_min, r_max  half-open range of rows to copy
 * c_min, c_max  half-open range of columns to copy
 * n             column stride of C
 *
 * `static inline` instead of plain `inline`: a plain C99 inline definition
 * provides no external symbol, so a call that is not inlined would not link.
 */
static inline void store_c(double* C, const double* C_temp, int r_min, int r_max, int c_min, int c_max, int n) {
    /* Every column copies the same number of bytes; compute it once. */
    const size_t col_bytes = (size_t)(r_max - r_min) * sizeof(double);
    for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
        memcpy(C + j * n + r_min, C_temp + jj * block_size, col_bytes);
    }
}
/* Edge length, in elements, of the square tiles the matrices are split into;
 * also used as the column stride when indexing the C_temp scratch tile. */
int block_size = 32;
/* This routine performs a dgemm operation
* C := C + A * B
@ -57,19 +39,33 @@ void square_dgemm(int n, double* A, double* B, double* C) {
}
for (int i = 0; i < n; i += block_size) {
int i_next = min(i + block_size, n);
int i_next = MIN(i + block_size, n);
for (int j = 0; j < n; j += block_size) {
int j_next = min(j + block_size, n);
int j_next = MIN(j + block_size, n);
// clear matrix C_temp
memset(C_temp, 0, block_size * block_size * sizeof(double));
for (int k = 0; k < n; k += block_size) {
int k_next = min(k + block_size, n);
naivemm(i, i_next, k, k_next, j, j_next, n, A_row, B, C_temp);
int k_next = MIN(k + block_size, n);
// begin naivemm
for (int i2 = i, ii2 = 0; i2 < i_next; ++i2, ++ii2) {
for (int j2 = j, jj2 = 0; j2 < j_next; ++j2, ++jj2) {
for (int k2 = k; k2 < k_next; k2++) {
C_temp[ii2 + jj2 * block_size] += A_row[i2 * n + k2] * B[k2 + j2 * n];
}
}
}
// end naivemm
}
store_c(C, C_temp, i, i_next, j, j_next, n);
// store C_temp in C
for (int j2 = j, jj2 = 0; j2 < j_next; ++j2, ++jj2) {
memcpy(C + j2 * n + i, C_temp + jj2 * block_size, (i_next - i) * sizeof(double));
}
// end store C_temp
}
}
}

View file

@ -1,7 +1,7 @@
%!PS-Adobe-2.0
%%Title: timing.ps
%%Creator: gnuplot 5.2 patchlevel 8
%%CreationDate: Mon Oct 3 21:44:11 2022
%%CreationDate: Wed Oct 12 11:03:04 2022
%%DocumentFonts: (atend)
%%BoundingBox: 50 50 554 770
%%Orientation: Landscape
@ -483,7 +483,7 @@ SDict begin [
/Creator (gnuplot 5.2 patchlevel 8)
% /Producer (gnuplot)
% /Keywords ()
/CreationDate (Mon Oct 3 21:44:11 2022)
/CreationDate (Wed Oct 12 11:03:04 2022)
/DOCINFO pdfmark
end
} ifelse
@ -960,58 +960,58 @@ LTb
0.58 0.00 0.83 C
6380 4486 M
399 0 V
1171 2365 M
7 2 V
481 -111 V
8 0 V
225 -23 V
7 -63 V
8 57 V
466 -55 V
7 -15 V
278 7 V
195 2 V
8 -480 V
7 483 V
466 -9 V
8 -112 V
7 125 V
721 -3 V
466 0 V
7 -19 V
233 20 V
8 -480 V
954 473 V
7 -573 V
954 564 V
8 -457 V
7 460 V
1171 2365 Pls
1178 2367 Pls
1659 2256 Pls
1667 2256 Pls
1892 2233 Pls
1899 2170 Pls
1907 2227 Pls
2373 2172 Pls
2380 2157 Pls
2658 2164 Pls
2853 2166 Pls
2861 1686 Pls
2868 2169 Pls
3334 2160 Pls
3342 2048 Pls
3349 2173 Pls
4070 2170 Pls
4536 2170 Pls
4543 2151 Pls
4776 2171 Pls
4784 1691 Pls
5738 2164 Pls
5745 1591 Pls
6699 2155 Pls
6707 1698 Pls
6714 2158 Pls
1171 2529 M
7 42 V
481 -132 V
8 6 V
225 -40 V
7 -211 V
8 237 V
466 -92 V
7 -145 V
278 126 V
195 -12 V
8 -598 V
7 615 V
466 -18 V
8 -180 V
7 180 V
721 -13 V
466 -6 V
7 -79 V
233 81 V
8 -586 V
954 516 V
7 -505 V
954 487 V
8 -489 V
7 489 V
1171 2529 Pls
1178 2571 Pls
1659 2439 Pls
1667 2445 Pls
1892 2405 Pls
1899 2194 Pls
1907 2431 Pls
2373 2339 Pls
2380 2194 Pls
2658 2320 Pls
2853 2308 Pls
2861 1710 Pls
2868 2325 Pls
3334 2307 Pls
3342 2127 Pls
3349 2307 Pls
4070 2294 Pls
4536 2288 Pls
4543 2209 Pls
4776 2290 Pls
4784 1704 Pls
5738 2220 Pls
5745 1715 Pls
6699 2202 Pls
6707 1713 Pls
6714 2202 Pls
6579 4486 Pls
% End plot #1
% Begin plot #2
@ -1030,58 +1030,58 @@ LTb
0.00 0.62 0.45 C
6380 4346 M
399 0 V
1171 2290 M
7 21 V
481 37 V
8 -1 V
225 2 V
7 -12 V
8 13 V
466 3 V
7 0 V
278 2 V
195 -5 V
8 -7 V
7 6 V
466 8 V
8 1 V
7 0 V
721 3 V
466 -38 V
7 -12 V
233 10 V
1171 2651 M
7 199 V
481 31 V
8 -101 V
225 27 V
7 -19 V
8 -29 V
466 43 V
7 71 V
278 -65 V
195 -58 V
8 17 V
7 -19 V
466 73 V
8 66 V
7 -64 V
721 4 V
466 1 V
7 62 V
233 -146 V
8 15 V
954 71 V
7 8 V
954 -65 V
8 16 V
954 2 V
7 -7 V
954 16 V
8 10 V
7 -11 V
1171 2290 Crs
1178 2311 Crs
1659 2348 Crs
1667 2347 Crs
1892 2349 Crs
1899 2337 Crs
1907 2350 Crs
2373 2353 Crs
2380 2353 Crs
2658 2355 Crs
2853 2350 Crs
2861 2343 Crs
2868 2349 Crs
3334 2357 Crs
3342 2358 Crs
3349 2358 Crs
4070 2361 Crs
4536 2323 Crs
4543 2311 Crs
4776 2321 Crs
4784 2337 Crs
5738 2339 Crs
5745 2332 Crs
6699 2348 Crs
6707 2358 Crs
6714 2347 Crs
7 -18 V
1171 2651 Crs
1178 2850 Crs
1659 2881 Crs
1667 2780 Crs
1892 2807 Crs
1899 2788 Crs
1907 2759 Crs
2373 2802 Crs
2380 2873 Crs
2658 2808 Crs
2853 2750 Crs
2861 2767 Crs
2868 2748 Crs
3334 2821 Crs
3342 2887 Crs
3349 2823 Crs
4070 2827 Crs
4536 2828 Crs
4543 2890 Crs
4776 2744 Crs
4784 2759 Crs
5738 2830 Crs
5745 2838 Crs
6699 2773 Crs
6707 2789 Crs
6714 2771 Crs
6579 4346 Crs
% End plot #2
% Begin plot #3
@ -1100,58 +1100,58 @@ LTb
0.34 0.71 0.91 C
6380 4206 M
399 0 V
1171 3743 M
7 112 V
481 86 V
8 -53 V
225 -26 V
7 81 V
8 -27 V
466 20 V
7 10 V
278 28 V
195 -16 V
8 0 V
7 4 V
466 12 V
1171 3798 M
7 73 V
481 75 V
8 -21 V
225 -27 V
7 46 V
8 -18 V
466 10 V
7 58 V
278 -17 V
195 -13 V
8 25 V
7 -25 V
466 11 V
8 35 V
7 -16 V
721 11 V
466 -55 V
7 74 V
233 -53 V
8 41 V
954 -22 V
7 22 V
954 -14 V
8 -64 V
7 17 V
1171 3743 Star
1178 3855 Star
1659 3941 Star
1667 3888 Star
1892 3862 Star
1899 3943 Star
1907 3916 Star
466 0 V
7 29 V
233 -46 V
8 37 V
954 -7 V
7 21 V
954 -17 V
8 24 V
7 -26 V
1171 3798 Star
1178 3871 Star
1659 3946 Star
1667 3925 Star
1892 3898 Star
1899 3944 Star
1907 3926 Star
2373 3936 Star
2380 3946 Star
2658 3974 Star
2853 3958 Star
2861 3958 Star
2868 3962 Star
3334 3974 Star
3342 4009 Star
3349 3993 Star
4070 4004 Star
4536 3949 Star
4543 4023 Star
4776 3970 Star
4784 4011 Star
5738 3989 Star
5745 4011 Star
6699 3997 Star
6707 3933 Star
6714 3950 Star
2380 3994 Star
2658 3977 Star
2853 3964 Star
2861 3989 Star
2868 3964 Star
3334 3975 Star
3342 4010 Star
3349 3994 Star
4070 4005 Star
4536 4005 Star
4543 4034 Star
4776 3988 Star
4784 4025 Star
5738 4018 Star
5745 4039 Star
6699 4022 Star
6707 4046 Star
6714 4020 Star
6579 4206 Star
% End plot #3
2.000 UL

View file

@ -1,29 +1,29 @@
#Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 2393.33 Percentage: 6.50
Size: 32 Mflop/s: 2400.13 Percentage: 6.52
Size: 96 Mflop/s: 1998.74 Percentage: 5.43
Size: 97 Mflop/s: 1996.01 Percentage: 5.42
Size: 127 Mflop/s: 1923.81 Percentage: 5.23
Size: 128 Mflop/s: 1731.98 Percentage: 4.71
Size: 129 Mflop/s: 1903.31 Percentage: 5.17
Size: 191 Mflop/s: 1736.78 Percentage: 4.72
Size: 192 Mflop/s: 1694.44 Percentage: 4.60
Size: 229 Mflop/s: 1715.1 Percentage: 4.66
Size: 255 Mflop/s: 1720.39 Percentage: 4.67
Size: 256 Mflop/s: 777.65 Percentage: 2.11
Size: 257 Mflop/s: 1729.27 Percentage: 4.70
Size: 319 Mflop/s: 1704.8 Percentage: 4.63
Size: 320 Mflop/s: 1414.84 Percentage: 3.84
Size: 321 Mflop/s: 1741.3 Percentage: 4.73
Size: 417 Mflop/s: 1733 Percentage: 4.71
Size: 479 Mflop/s: 1731.17 Percentage: 4.70
Size: 480 Mflop/s: 1678.77 Percentage: 4.56
Size: 511 Mflop/s: 1733.6 Percentage: 4.71
Size: 512 Mflop/s: 782.96 Percentage: 2.13
Size: 639 Mflop/s: 1714.42 Percentage: 4.66
Size: 640 Mflop/s: 663.418 Percentage: 1.80
Size: 767 Mflop/s: 1690.82 Percentage: 4.59
Size: 768 Mflop/s: 792.043 Percentage: 2.15
Size: 769 Mflop/s: 1696.95 Percentage: 4.61
#Average percentage of Peak = 4.47314
Size: 31 Mflop/s: 3140.45 Percentage: 8.53
Size: 32 Mflop/s: 3364.78 Percentage: 9.14
Size: 96 Mflop/s: 2703.08 Percentage: 7.35
Size: 97 Mflop/s: 2729.68 Percentage: 7.42
Size: 127 Mflop/s: 2556.58 Percentage: 6.95
Size: 128 Mflop/s: 1803.41 Percentage: 4.90
Size: 129 Mflop/s: 2669.26 Percentage: 7.25
Size: 191 Mflop/s: 2290.09 Percentage: 6.22
Size: 192 Mflop/s: 1801.66 Percentage: 4.90
Size: 229 Mflop/s: 2218.61 Percentage: 6.03
Size: 255 Mflop/s: 2178.15 Percentage: 5.92
Size: 256 Mflop/s: 808.413 Percentage: 2.20
Size: 257 Mflop/s: 2238.93 Percentage: 6.08
Size: 319 Mflop/s: 2174.45 Percentage: 5.91
Size: 320 Mflop/s: 1612.13 Percentage: 4.38
Size: 321 Mflop/s: 2173.64 Percentage: 5.91
Size: 417 Mflop/s: 2125.36 Percentage: 5.78
Size: 479 Mflop/s: 2107.13 Percentage: 5.73
Size: 480 Mflop/s: 1848.43 Percentage: 5.02
Size: 511 Mflop/s: 2112.99 Percentage: 5.74
Size: 512 Mflop/s: 801.127 Percentage: 2.18
Size: 639 Mflop/s: 1881.94 Percentage: 5.11
Size: 640 Mflop/s: 815.847 Percentage: 2.22
Size: 767 Mflop/s: 1825.75 Percentage: 4.96
Size: 768 Mflop/s: 812.933 Percentage: 2.21
Size: 769 Mflop/s: 1825.38 Percentage: 4.96
#Average percentage of Peak = 5.4996

View file

@ -1,29 +1,29 @@
#Description: Reference dgemm.
Size: 31 Mflop/s: 23449.2 Percentage: 63.72
Size: 32 Mflop/s: 28198.9 Percentage: 76.63
Size: 96 Mflop/s: 32542.3 Percentage: 88.43
Size: 97 Mflop/s: 29801.3 Percentage: 80.98
Size: 127 Mflop/s: 28557.8 Percentage: 77.60
Size: 128 Mflop/s: 32643.3 Percentage: 88.70
Size: 129 Mflop/s: 31198.2 Percentage: 84.78
Size: 191 Mflop/s: 32247.3 Percentage: 87.63
Size: 192 Mflop/s: 32830.6 Percentage: 89.21
Size: 229 Mflop/s: 34360.9 Percentage: 93.37
Size: 255 Mflop/s: 33477.7 Percentage: 90.97
Size: 256 Mflop/s: 33473.9 Percentage: 90.96
Size: 257 Mflop/s: 33686.5 Percentage: 91.54
Size: 319 Mflop/s: 34335.2 Percentage: 93.30
Size: 320 Mflop/s: 36438.1 Percentage: 99.02
Size: 321 Mflop/s: 35433.7 Percentage: 96.29
Size: 417 Mflop/s: 36133.7 Percentage: 98.19
Size: 479 Mflop/s: 32951.4 Percentage: 89.54
Size: 480 Mflop/s: 37260 Percentage:101.25
Size: 511 Mflop/s: 34128 Percentage: 92.74
Size: 512 Mflop/s: 36526.4 Percentage: 99.26
Size: 639 Mflop/s: 35249.2 Percentage: 95.79
Size: 640 Mflop/s: 36538.7 Percentage: 99.29
Size: 767 Mflop/s: 35718.5 Percentage: 97.06
Size: 768 Mflop/s: 32116.8 Percentage: 87.27
Size: 769 Mflop/s: 33033.9 Percentage: 89.77
#Average percentage of Peak = 90.1266
Size: 31 Mflop/s: 25677.4 Percentage: 69.78
Size: 32 Mflop/s: 28952.1 Percentage: 78.67
Size: 96 Mflop/s: 32816.4 Percentage: 89.18
Size: 97 Mflop/s: 31699.2 Percentage: 86.14
Size: 127 Mflop/s: 30274.5 Percentage: 82.27
Size: 128 Mflop/s: 32721.7 Percentage: 88.92
Size: 129 Mflop/s: 31746.4 Percentage: 86.27
Size: 191 Mflop/s: 32263.1 Percentage: 87.67
Size: 192 Mflop/s: 35491.2 Percentage: 96.44
Size: 229 Mflop/s: 34557.2 Percentage: 93.91
Size: 255 Mflop/s: 33771.3 Percentage: 91.77
Size: 256 Mflop/s: 35221.1 Percentage: 95.71
Size: 257 Mflop/s: 33807.9 Percentage: 91.87
Size: 319 Mflop/s: 34415.8 Percentage: 93.52
Size: 320 Mflop/s: 36500.2 Percentage: 99.19
Size: 321 Mflop/s: 35508.1 Percentage: 96.49
Size: 417 Mflop/s: 36157.6 Percentage: 98.25
Size: 479 Mflop/s: 36186.4 Percentage: 98.33
Size: 480 Mflop/s: 37971.3 Percentage:103.18
Size: 511 Mflop/s: 35144 Percentage: 95.50
Size: 512 Mflop/s: 37362.5 Percentage:101.53
Size: 639 Mflop/s: 36989.1 Percentage:100.51
Size: 640 Mflop/s: 38267.8 Percentage:103.99
Size: 767 Mflop/s: 37220.8 Percentage:101.14
Size: 768 Mflop/s: 38744 Percentage:105.28
Size: 769 Mflop/s: 37076.1 Percentage:100.75
#Average percentage of Peak = 93.7023

View file

@ -1,29 +1,29 @@
#Description: Block-based dgemm.
Size: 31 Mflop/s: 2112.63 Percentage: 5.74
Size: 32 Mflop/s: 2187.44 Percentage: 5.94
Size: 96 Mflop/s: 2325.39 Percentage: 6.32
Size: 97 Mflop/s: 2322.81 Percentage: 6.31
Size: 127 Mflop/s: 2330.3 Percentage: 6.33
Size: 128 Mflop/s: 2282.93 Percentage: 6.20
Size: 129 Mflop/s: 2334.25 Percentage: 6.34
Size: 191 Mflop/s: 2345.91 Percentage: 6.37
Size: 192 Mflop/s: 2345.38 Percentage: 6.37
Size: 229 Mflop/s: 2351.01 Percentage: 6.39
Size: 255 Mflop/s: 2335.21 Percentage: 6.35
Size: 256 Mflop/s: 2306.48 Percentage: 6.27
Size: 257 Mflop/s: 2330.68 Percentage: 6.33
Size: 319 Mflop/s: 2360.03 Percentage: 6.41
Size: 320 Mflop/s: 2364.53 Percentage: 6.43
Size: 321 Mflop/s: 2366.38 Percentage: 6.43
Size: 417 Mflop/s: 2378.34 Percentage: 6.46
Size: 479 Mflop/s: 2233.05 Percentage: 6.07
Size: 480 Mflop/s: 2187.87 Percentage: 5.95
Size: 511 Mflop/s: 2224.61 Percentage: 6.05
Size: 512 Mflop/s: 2284.85 Percentage: 6.21
Size: 639 Mflop/s: 2292.78 Percentage: 6.23
Size: 640 Mflop/s: 2264.7 Percentage: 6.15
Size: 767 Mflop/s: 2324.83 Percentage: 6.32
Size: 768 Mflop/s: 2363.92 Percentage: 6.42
Size: 769 Mflop/s: 2321.31 Percentage: 6.31
#Average percentage of Peak = 6.25811
Size: 31 Mflop/s: 3844.56 Percentage: 10.45
Size: 32 Mflop/s: 5342.55 Percentage: 14.52
Size: 96 Mflop/s: 5620.08 Percentage: 15.27
Size: 97 Mflop/s: 4754.1 Percentage: 12.92
Size: 127 Mflop/s: 4977.82 Percentage: 13.53
Size: 128 Mflop/s: 4817.8 Percentage: 13.09
Size: 129 Mflop/s: 4594.25 Percentage: 12.48
Size: 191 Mflop/s: 4931.27 Percentage: 13.40
Size: 192 Mflop/s: 5549.67 Percentage: 15.08
Size: 229 Mflop/s: 4982.59 Percentage: 13.54
Size: 255 Mflop/s: 4528.43 Percentage: 12.31
Size: 256 Mflop/s: 4652.68 Percentage: 12.64
Size: 257 Mflop/s: 4512.33 Percentage: 12.26
Size: 319 Mflop/s: 5093.38 Percentage: 13.84
Size: 320 Mflop/s: 5674.61 Percentage: 15.42
Size: 321 Mflop/s: 5111.09 Percentage: 13.89
Size: 417 Mflop/s: 5143.98 Percentage: 13.98
Size: 479 Mflop/s: 5152.51 Percentage: 14.00
Size: 480 Mflop/s: 5703 Percentage: 15.50
Size: 511 Mflop/s: 4479.96 Percentage: 12.17
Size: 512 Mflop/s: 4596.26 Percentage: 12.49
Size: 639 Mflop/s: 5168.59 Percentage: 14.05
Size: 640 Mflop/s: 5232.97 Percentage: 14.22
Size: 767 Mflop/s: 4701.09 Percentage: 12.77
Size: 768 Mflop/s: 4826.12 Percentage: 13.11
Size: 769 Mflop/s: 4686.21 Percentage: 12.73
#Average percentage of Peak = 13.4488