hw1: compiler flags

This commit is contained in:
Claudio Maggioni 2022-10-12 11:02:19 +02:00
parent abbd0275ec
commit 980eb5f0b9
6 changed files with 259 additions and 263 deletions

View file

@ -3,8 +3,8 @@
# #
CC = gcc CC = gcc
OPT = -O2 OPT = -O3
CFLAGS = -Wall -std=gnu99 $(OPT) CFLAGS = -Wall -std=gnu99 -march=haswell -ffast-math $(OPT)
LDFLAGS = -Wall LDFLAGS = -Wall
# librt is needed for clock_gettime # librt is needed for clock_gettime
LDLIBS = -lrt -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm -ldl -m64 -I${MKLROOT}/include LDLIBS = -lrt -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm -ldl -m64 -I${MKLROOT}/include

View file

@ -15,29 +15,11 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
*/ */
#define MIN(a,b) (((a)<(b))?(a):(b))
const char* dgemm_desc = "Block-based dgemm."; const char* dgemm_desc = "Block-based dgemm.";
const int block_size = 26; int block_size = 32;
/*
 * Return the smaller of two ints (returns b when the two are equal).
 *
 * Declared `static inline` rather than plain `inline`: under C99/gnu99
 * semantics a plain `inline` definition does not emit an external definition,
 * so any call the compiler decides not to inline (e.g. when building without
 * optimization) would be left as an undefined reference at link time.
 */
static inline int min(int a, int b) {
    return a < b ? a : b;
}
/*
 * Accumulate one tile product into C_temp using a plain three-loop multiply.
 *
 * For every row i in [r_min, r_max), column j in [c_min, c_max) and inner
 * index k in [k_min, k_max):
 *
 *     C_temp[(i - r_min) + (j - c_min) * block_size] += A_row[i * n + k] * B[k + j * n]
 *
 * A_row is indexed with stride n along i (row-major access), B with stride n
 * along j (column-major access). C_temp is addressed as a tile with leading
 * dimension block_size and is only added to, never cleared, here.
 *
 * A_row and B are read-only, hence const-qualified.
 *
 * Declared `static inline` rather than plain `inline`: under C99/gnu99 a plain
 * `inline` definition provides no external definition, so a non-inlined call
 * would fail to link.
 */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, const double* A_row, const double* B, double* C_temp) {
for (int i = r_min, ii = 0; i < r_max; ++i, ++ii) {
for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
for (int k = k_min; k < k_max; k++) {
C_temp[ii + jj * block_size] += A_row[i * n + k] * B[k + j * n];
}
}
}
}
/*
 * Copy a tile held in C_temp into the matrix C.
 *
 * For each column j in [c_min, c_max), copies (r_max - r_min) doubles from
 * C_temp (tile column j - c_min, leading dimension block_size) to
 * C + j * n + r_min. The destination is overwritten, not accumulated into.
 *
 * C_temp is only read, hence const-qualified.
 *
 * Declared `static inline` rather than plain `inline`: under C99/gnu99 a plain
 * `inline` definition provides no external definition, so a non-inlined call
 * would fail to link.
 */
static inline void store_c(double* C, const double* C_temp, int r_min, int r_max, int c_min, int c_max, int n) {
for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
memcpy(C + j * n + r_min, C_temp + jj * block_size, (r_max - r_min) * sizeof(double));
}
}
/* This routine performs a dgemm operation /* This routine performs a dgemm operation
* C := C + A * B * C := C + A * B
@ -57,19 +39,33 @@ void square_dgemm(int n, double* A, double* B, double* C) {
} }
for (int i = 0; i < n; i += block_size) { for (int i = 0; i < n; i += block_size) {
int i_next = min(i + block_size, n); int i_next = MIN(i + block_size, n);
for (int j = 0; j < n; j += block_size) { for (int j = 0; j < n; j += block_size) {
int j_next = min(j + block_size, n); int j_next = MIN(j + block_size, n);
// clear matrix C_temp
memset(C_temp, 0, block_size * block_size * sizeof(double)); memset(C_temp, 0, block_size * block_size * sizeof(double));
for (int k = 0; k < n; k += block_size) { for (int k = 0; k < n; k += block_size) {
int k_next = min(k + block_size, n); int k_next = MIN(k + block_size, n);
naivemm(i, i_next, k, k_next, j, j_next, n, A_row, B, C_temp);
// begin naivemm
for (int i2 = i, ii2 = 0; i2 < i_next; ++i2, ++ii2) {
for (int j2 = j, jj2 = 0; j2 < j_next; ++j2, ++jj2) {
for (int k2 = k; k2 < k_next; k2++) {
C_temp[ii2 + jj2 * block_size] += A_row[i2 * n + k2] * B[k2 + j2 * n];
}
}
}
// end naivemm
} }
store_c(C, C_temp, i, i_next, j, j_next, n); // store C_temp in C
for (int j2 = j, jj2 = 0; j2 < j_next; ++j2, ++jj2) {
memcpy(C + j2 * n + i, C_temp + jj2 * block_size, (i_next - i) * sizeof(double));
}
// end store C_temp
} }
} }
} }

View file

@ -1,7 +1,7 @@
%!PS-Adobe-2.0 %!PS-Adobe-2.0
%%Title: timing.ps %%Title: timing.ps
%%Creator: gnuplot 5.2 patchlevel 8 %%Creator: gnuplot 5.2 patchlevel 8
%%CreationDate: Mon Oct 3 21:44:11 2022 %%CreationDate: Wed Oct 12 11:03:04 2022
%%DocumentFonts: (atend) %%DocumentFonts: (atend)
%%BoundingBox: 50 50 554 770 %%BoundingBox: 50 50 554 770
%%Orientation: Landscape %%Orientation: Landscape
@ -483,7 +483,7 @@ SDict begin [
/Creator (gnuplot 5.2 patchlevel 8) /Creator (gnuplot 5.2 patchlevel 8)
% /Producer (gnuplot) % /Producer (gnuplot)
% /Keywords () % /Keywords ()
/CreationDate (Mon Oct 3 21:44:11 2022) /CreationDate (Wed Oct 12 11:03:04 2022)
/DOCINFO pdfmark /DOCINFO pdfmark
end end
} ifelse } ifelse
@ -960,58 +960,58 @@ LTb
0.58 0.00 0.83 C 0.58 0.00 0.83 C
6380 4486 M 6380 4486 M
399 0 V 399 0 V
1171 2365 M 1171 2529 M
7 2 V 7 42 V
481 -111 V 481 -132 V
8 0 V 8 6 V
225 -23 V 225 -40 V
7 -63 V 7 -211 V
8 57 V 8 237 V
466 -55 V 466 -92 V
7 -15 V 7 -145 V
278 7 V 278 126 V
195 2 V 195 -12 V
8 -480 V 8 -598 V
7 483 V 7 615 V
466 -9 V 466 -18 V
8 -112 V 8 -180 V
7 125 V 7 180 V
721 -3 V 721 -13 V
466 0 V 466 -6 V
7 -19 V 7 -79 V
233 20 V 233 81 V
8 -480 V 8 -586 V
954 473 V 954 516 V
7 -573 V 7 -505 V
954 564 V 954 487 V
8 -457 V 8 -489 V
7 460 V 7 489 V
1171 2365 Pls 1171 2529 Pls
1178 2367 Pls 1178 2571 Pls
1659 2256 Pls 1659 2439 Pls
1667 2256 Pls 1667 2445 Pls
1892 2233 Pls 1892 2405 Pls
1899 2170 Pls 1899 2194 Pls
1907 2227 Pls 1907 2431 Pls
2373 2172 Pls 2373 2339 Pls
2380 2157 Pls 2380 2194 Pls
2658 2164 Pls 2658 2320 Pls
2853 2166 Pls 2853 2308 Pls
2861 1686 Pls 2861 1710 Pls
2868 2169 Pls 2868 2325 Pls
3334 2160 Pls 3334 2307 Pls
3342 2048 Pls 3342 2127 Pls
3349 2173 Pls 3349 2307 Pls
4070 2170 Pls 4070 2294 Pls
4536 2170 Pls 4536 2288 Pls
4543 2151 Pls 4543 2209 Pls
4776 2171 Pls 4776 2290 Pls
4784 1691 Pls 4784 1704 Pls
5738 2164 Pls 5738 2220 Pls
5745 1591 Pls 5745 1715 Pls
6699 2155 Pls 6699 2202 Pls
6707 1698 Pls 6707 1713 Pls
6714 2158 Pls 6714 2202 Pls
6579 4486 Pls 6579 4486 Pls
% End plot #1 % End plot #1
% Begin plot #2 % Begin plot #2
@ -1030,58 +1030,58 @@ LTb
0.00 0.62 0.45 C 0.00 0.62 0.45 C
6380 4346 M 6380 4346 M
399 0 V 399 0 V
1171 2290 M 1171 2651 M
7 21 V 7 199 V
481 37 V 481 31 V
8 -1 V 8 -101 V
225 2 V 225 27 V
7 -12 V 7 -19 V
8 13 V 8 -29 V
466 3 V 466 43 V
7 0 V 7 71 V
278 2 V 278 -65 V
195 -5 V 195 -58 V
8 -7 V 8 17 V
7 6 V 7 -19 V
466 8 V 466 73 V
8 1 V 8 66 V
7 0 V 7 -64 V
721 3 V 721 4 V
466 -38 V 466 1 V
7 -12 V 7 62 V
233 10 V 233 -146 V
8 15 V
954 71 V
7 8 V
954 -65 V
8 16 V 8 16 V
954 2 V 7 -18 V
7 -7 V 1171 2651 Crs
954 16 V 1178 2850 Crs
8 10 V 1659 2881 Crs
7 -11 V 1667 2780 Crs
1171 2290 Crs 1892 2807 Crs
1178 2311 Crs 1899 2788 Crs
1659 2348 Crs 1907 2759 Crs
1667 2347 Crs 2373 2802 Crs
1892 2349 Crs 2380 2873 Crs
1899 2337 Crs 2658 2808 Crs
1907 2350 Crs 2853 2750 Crs
2373 2353 Crs 2861 2767 Crs
2380 2353 Crs 2868 2748 Crs
2658 2355 Crs 3334 2821 Crs
2853 2350 Crs 3342 2887 Crs
2861 2343 Crs 3349 2823 Crs
2868 2349 Crs 4070 2827 Crs
3334 2357 Crs 4536 2828 Crs
3342 2358 Crs 4543 2890 Crs
3349 2358 Crs 4776 2744 Crs
4070 2361 Crs 4784 2759 Crs
4536 2323 Crs 5738 2830 Crs
4543 2311 Crs 5745 2838 Crs
4776 2321 Crs 6699 2773 Crs
4784 2337 Crs 6707 2789 Crs
5738 2339 Crs 6714 2771 Crs
5745 2332 Crs
6699 2348 Crs
6707 2358 Crs
6714 2347 Crs
6579 4346 Crs 6579 4346 Crs
% End plot #2 % End plot #2
% Begin plot #3 % Begin plot #3
@ -1100,58 +1100,58 @@ LTb
0.34 0.71 0.91 C 0.34 0.71 0.91 C
6380 4206 M 6380 4206 M
399 0 V 399 0 V
1171 3743 M 1171 3798 M
7 112 V 7 73 V
481 86 V 481 75 V
8 -53 V 8 -21 V
225 -26 V 225 -27 V
7 81 V 7 46 V
8 -27 V 8 -18 V
466 20 V 466 10 V
7 10 V 7 58 V
278 28 V 278 -17 V
195 -16 V 195 -13 V
8 0 V 8 25 V
7 4 V 7 -25 V
466 12 V 466 11 V
8 35 V 8 35 V
7 -16 V 7 -16 V
721 11 V 721 11 V
466 -55 V 466 0 V
7 74 V 7 29 V
233 -53 V 233 -46 V
8 41 V 8 37 V
954 -22 V 954 -7 V
7 22 V 7 21 V
954 -14 V 954 -17 V
8 -64 V 8 24 V
7 17 V 7 -26 V
1171 3743 Star 1171 3798 Star
1178 3855 Star 1178 3871 Star
1659 3941 Star 1659 3946 Star
1667 3888 Star 1667 3925 Star
1892 3862 Star 1892 3898 Star
1899 3943 Star 1899 3944 Star
1907 3916 Star 1907 3926 Star
2373 3936 Star 2373 3936 Star
2380 3946 Star 2380 3994 Star
2658 3974 Star 2658 3977 Star
2853 3958 Star 2853 3964 Star
2861 3958 Star 2861 3989 Star
2868 3962 Star 2868 3964 Star
3334 3974 Star 3334 3975 Star
3342 4009 Star 3342 4010 Star
3349 3993 Star 3349 3994 Star
4070 4004 Star 4070 4005 Star
4536 3949 Star 4536 4005 Star
4543 4023 Star 4543 4034 Star
4776 3970 Star 4776 3988 Star
4784 4011 Star 4784 4025 Star
5738 3989 Star 5738 4018 Star
5745 4011 Star 5745 4039 Star
6699 3997 Star 6699 4022 Star
6707 3933 Star 6707 4046 Star
6714 3950 Star 6714 4020 Star
6579 4206 Star 6579 4206 Star
% End plot #3 % End plot #3
2.000 UL 2.000 UL

View file

@ -1,29 +1,29 @@
#Description: Naive, three-loop dgemm. #Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 2393.33 Percentage: 6.50 Size: 31 Mflop/s: 3140.45 Percentage: 8.53
Size: 32 Mflop/s: 2400.13 Percentage: 6.52 Size: 32 Mflop/s: 3364.78 Percentage: 9.14
Size: 96 Mflop/s: 1998.74 Percentage: 5.43 Size: 96 Mflop/s: 2703.08 Percentage: 7.35
Size: 97 Mflop/s: 1996.01 Percentage: 5.42 Size: 97 Mflop/s: 2729.68 Percentage: 7.42
Size: 127 Mflop/s: 1923.81 Percentage: 5.23 Size: 127 Mflop/s: 2556.58 Percentage: 6.95
Size: 128 Mflop/s: 1731.98 Percentage: 4.71 Size: 128 Mflop/s: 1803.41 Percentage: 4.90
Size: 129 Mflop/s: 1903.31 Percentage: 5.17 Size: 129 Mflop/s: 2669.26 Percentage: 7.25
Size: 191 Mflop/s: 1736.78 Percentage: 4.72 Size: 191 Mflop/s: 2290.09 Percentage: 6.22
Size: 192 Mflop/s: 1694.44 Percentage: 4.60 Size: 192 Mflop/s: 1801.66 Percentage: 4.90
Size: 229 Mflop/s: 1715.1 Percentage: 4.66 Size: 229 Mflop/s: 2218.61 Percentage: 6.03
Size: 255 Mflop/s: 1720.39 Percentage: 4.67 Size: 255 Mflop/s: 2178.15 Percentage: 5.92
Size: 256 Mflop/s: 777.65 Percentage: 2.11 Size: 256 Mflop/s: 808.413 Percentage: 2.20
Size: 257 Mflop/s: 1729.27 Percentage: 4.70 Size: 257 Mflop/s: 2238.93 Percentage: 6.08
Size: 319 Mflop/s: 1704.8 Percentage: 4.63 Size: 319 Mflop/s: 2174.45 Percentage: 5.91
Size: 320 Mflop/s: 1414.84 Percentage: 3.84 Size: 320 Mflop/s: 1612.13 Percentage: 4.38
Size: 321 Mflop/s: 1741.3 Percentage: 4.73 Size: 321 Mflop/s: 2173.64 Percentage: 5.91
Size: 417 Mflop/s: 1733 Percentage: 4.71 Size: 417 Mflop/s: 2125.36 Percentage: 5.78
Size: 479 Mflop/s: 1731.17 Percentage: 4.70 Size: 479 Mflop/s: 2107.13 Percentage: 5.73
Size: 480 Mflop/s: 1678.77 Percentage: 4.56 Size: 480 Mflop/s: 1848.43 Percentage: 5.02
Size: 511 Mflop/s: 1733.6 Percentage: 4.71 Size: 511 Mflop/s: 2112.99 Percentage: 5.74
Size: 512 Mflop/s: 782.96 Percentage: 2.13 Size: 512 Mflop/s: 801.127 Percentage: 2.18
Size: 639 Mflop/s: 1714.42 Percentage: 4.66 Size: 639 Mflop/s: 1881.94 Percentage: 5.11
Size: 640 Mflop/s: 663.418 Percentage: 1.80 Size: 640 Mflop/s: 815.847 Percentage: 2.22
Size: 767 Mflop/s: 1690.82 Percentage: 4.59 Size: 767 Mflop/s: 1825.75 Percentage: 4.96
Size: 768 Mflop/s: 792.043 Percentage: 2.15 Size: 768 Mflop/s: 812.933 Percentage: 2.21
Size: 769 Mflop/s: 1696.95 Percentage: 4.61 Size: 769 Mflop/s: 1825.38 Percentage: 4.96
#Average percentage of Peak = 4.47314 #Average percentage of Peak = 5.4996

View file

@ -1,29 +1,29 @@
#Description: Reference dgemm. #Description: Reference dgemm.
Size: 31 Mflop/s: 23449.2 Percentage: 63.72 Size: 31 Mflop/s: 25677.4 Percentage: 69.78
Size: 32 Mflop/s: 28198.9 Percentage: 76.63 Size: 32 Mflop/s: 28952.1 Percentage: 78.67
Size: 96 Mflop/s: 32542.3 Percentage: 88.43 Size: 96 Mflop/s: 32816.4 Percentage: 89.18
Size: 97 Mflop/s: 29801.3 Percentage: 80.98 Size: 97 Mflop/s: 31699.2 Percentage: 86.14
Size: 127 Mflop/s: 28557.8 Percentage: 77.60 Size: 127 Mflop/s: 30274.5 Percentage: 82.27
Size: 128 Mflop/s: 32643.3 Percentage: 88.70 Size: 128 Mflop/s: 32721.7 Percentage: 88.92
Size: 129 Mflop/s: 31198.2 Percentage: 84.78 Size: 129 Mflop/s: 31746.4 Percentage: 86.27
Size: 191 Mflop/s: 32247.3 Percentage: 87.63 Size: 191 Mflop/s: 32263.1 Percentage: 87.67
Size: 192 Mflop/s: 32830.6 Percentage: 89.21 Size: 192 Mflop/s: 35491.2 Percentage: 96.44
Size: 229 Mflop/s: 34360.9 Percentage: 93.37 Size: 229 Mflop/s: 34557.2 Percentage: 93.91
Size: 255 Mflop/s: 33477.7 Percentage: 90.97 Size: 255 Mflop/s: 33771.3 Percentage: 91.77
Size: 256 Mflop/s: 33473.9 Percentage: 90.96 Size: 256 Mflop/s: 35221.1 Percentage: 95.71
Size: 257 Mflop/s: 33686.5 Percentage: 91.54 Size: 257 Mflop/s: 33807.9 Percentage: 91.87
Size: 319 Mflop/s: 34335.2 Percentage: 93.30 Size: 319 Mflop/s: 34415.8 Percentage: 93.52
Size: 320 Mflop/s: 36438.1 Percentage: 99.02 Size: 320 Mflop/s: 36500.2 Percentage: 99.19
Size: 321 Mflop/s: 35433.7 Percentage: 96.29 Size: 321 Mflop/s: 35508.1 Percentage: 96.49
Size: 417 Mflop/s: 36133.7 Percentage: 98.19 Size: 417 Mflop/s: 36157.6 Percentage: 98.25
Size: 479 Mflop/s: 32951.4 Percentage: 89.54 Size: 479 Mflop/s: 36186.4 Percentage: 98.33
Size: 480 Mflop/s: 37260 Percentage:101.25 Size: 480 Mflop/s: 37971.3 Percentage:103.18
Size: 511 Mflop/s: 34128 Percentage: 92.74 Size: 511 Mflop/s: 35144 Percentage: 95.50
Size: 512 Mflop/s: 36526.4 Percentage: 99.26 Size: 512 Mflop/s: 37362.5 Percentage:101.53
Size: 639 Mflop/s: 35249.2 Percentage: 95.79 Size: 639 Mflop/s: 36989.1 Percentage:100.51
Size: 640 Mflop/s: 36538.7 Percentage: 99.29 Size: 640 Mflop/s: 38267.8 Percentage:103.99
Size: 767 Mflop/s: 35718.5 Percentage: 97.06 Size: 767 Mflop/s: 37220.8 Percentage:101.14
Size: 768 Mflop/s: 32116.8 Percentage: 87.27 Size: 768 Mflop/s: 38744 Percentage:105.28
Size: 769 Mflop/s: 33033.9 Percentage: 89.77 Size: 769 Mflop/s: 37076.1 Percentage:100.75
#Average percentage of Peak = 90.1266 #Average percentage of Peak = 93.7023

View file

@ -1,29 +1,29 @@
#Description: Block-based dgemm. #Description: Block-based dgemm.
Size: 31 Mflop/s: 2112.63 Percentage: 5.74 Size: 31 Mflop/s: 3844.56 Percentage: 10.45
Size: 32 Mflop/s: 2187.44 Percentage: 5.94 Size: 32 Mflop/s: 5342.55 Percentage: 14.52
Size: 96 Mflop/s: 2325.39 Percentage: 6.32 Size: 96 Mflop/s: 5620.08 Percentage: 15.27
Size: 97 Mflop/s: 2322.81 Percentage: 6.31 Size: 97 Mflop/s: 4754.1 Percentage: 12.92
Size: 127 Mflop/s: 2330.3 Percentage: 6.33 Size: 127 Mflop/s: 4977.82 Percentage: 13.53
Size: 128 Mflop/s: 2282.93 Percentage: 6.20 Size: 128 Mflop/s: 4817.8 Percentage: 13.09
Size: 129 Mflop/s: 2334.25 Percentage: 6.34 Size: 129 Mflop/s: 4594.25 Percentage: 12.48
Size: 191 Mflop/s: 2345.91 Percentage: 6.37 Size: 191 Mflop/s: 4931.27 Percentage: 13.40
Size: 192 Mflop/s: 2345.38 Percentage: 6.37 Size: 192 Mflop/s: 5549.67 Percentage: 15.08
Size: 229 Mflop/s: 2351.01 Percentage: 6.39 Size: 229 Mflop/s: 4982.59 Percentage: 13.54
Size: 255 Mflop/s: 2335.21 Percentage: 6.35 Size: 255 Mflop/s: 4528.43 Percentage: 12.31
Size: 256 Mflop/s: 2306.48 Percentage: 6.27 Size: 256 Mflop/s: 4652.68 Percentage: 12.64
Size: 257 Mflop/s: 2330.68 Percentage: 6.33 Size: 257 Mflop/s: 4512.33 Percentage: 12.26
Size: 319 Mflop/s: 2360.03 Percentage: 6.41 Size: 319 Mflop/s: 5093.38 Percentage: 13.84
Size: 320 Mflop/s: 2364.53 Percentage: 6.43 Size: 320 Mflop/s: 5674.61 Percentage: 15.42
Size: 321 Mflop/s: 2366.38 Percentage: 6.43 Size: 321 Mflop/s: 5111.09 Percentage: 13.89
Size: 417 Mflop/s: 2378.34 Percentage: 6.46 Size: 417 Mflop/s: 5143.98 Percentage: 13.98
Size: 479 Mflop/s: 2233.05 Percentage: 6.07 Size: 479 Mflop/s: 5152.51 Percentage: 14.00
Size: 480 Mflop/s: 2187.87 Percentage: 5.95 Size: 480 Mflop/s: 5703 Percentage: 15.50
Size: 511 Mflop/s: 2224.61 Percentage: 6.05 Size: 511 Mflop/s: 4479.96 Percentage: 12.17
Size: 512 Mflop/s: 2284.85 Percentage: 6.21 Size: 512 Mflop/s: 4596.26 Percentage: 12.49
Size: 639 Mflop/s: 2292.78 Percentage: 6.23 Size: 639 Mflop/s: 5168.59 Percentage: 14.05
Size: 640 Mflop/s: 2264.7 Percentage: 6.15 Size: 640 Mflop/s: 5232.97 Percentage: 14.22
Size: 767 Mflop/s: 2324.83 Percentage: 6.32 Size: 767 Mflop/s: 4701.09 Percentage: 12.77
Size: 768 Mflop/s: 2363.92 Percentage: 6.42 Size: 768 Mflop/s: 4826.12 Percentage: 13.11
Size: 769 Mflop/s: 2321.31 Percentage: 6.31 Size: 769 Mflop/s: 4686.21 Percentage: 12.73
#Average percentage of Peak = 6.25811 #Average percentage of Peak = 13.4488