hw1: blocked dgemm is correct

This commit is contained in:
Claudio Maggioni 2022-09-28 13:20:37 +02:00
parent 27fc66cf14
commit 3b9b6babbb
6 changed files with 1303 additions and 16 deletions

View file

@ -79,7 +79,7 @@ int main (int argc, char **argv)
/* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */ /* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
/* A representative subset of the first list. Currently uncommented. */ /* A representative subset of the first list. Currently uncommented. */
{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257, { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 }; 319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]); int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);

View file

@ -1,3 +1,4 @@
#include <string.h>
/* /*
Please include compiler name below (you may also include any other modules you would like to be loaded) Please include compiler name below (you may also include any other modules you would like to be loaded)
@ -13,25 +14,45 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
*/ */
const char* dgemm_desc = "Naive, three-loop dgemm."; const char* dgemm_desc = "Block-based dgemm.";
/* Edge length, in matrix elements, of the square tiles that square_dgemm
 * steps through along each of the i, j and k dimensions.
 * NOTE(review): 50 is presumably an empirically chosen value; confirm it
 * against the cache sizes of the target machine. */
const int block_size = 50;
/*
 * Return the smaller of the two ints a and b (b when they compare equal).
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
 * `inline` definition does not emit an external definition, so any call the
 * compiler decides not to inline (e.g. when building without optimization)
 * fails at link time with an undefined reference.
 */
static inline int min(int a, int b) {
    return a < b ? a : b;
}
/*
 * Three-loop multiply-accumulate restricted to one tile:
 *
 *   C(i,j) += A(i,k) * B(k,j)
 *
 * for r_min <= i < r_max, c_min <= j < c_max and k_min <= k < k_max.
 * Element (row, col) of each matrix is read at index row + col * n, i.e.
 * column-major storage with leading dimension n. All upper bounds are
 * exclusive. A and B are only read; C is updated in place.
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
 * `inline` definition does not emit an external definition, so a call that
 * is not inlined (e.g. when building without optimization) fails to link.
 */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max,
                           int c_min, int c_max, int n,
                           const double* A, const double* B, double* C) {
    /* For each row i of the tile of A */
    for (int i = r_min; i < r_max; ++i) {
        /* For each column j of the tile of B */
        for (int j = c_min; j < c_max; ++j) {
            /* Accumulate this tile's share of the dot product into C(i,j) */
            for (int k = k_min; k < k_max; k++) {
                C[i + j * n] += A[i + k * n] * B[k + j * n];
            }
        }
    }
}
/* This routine performs a dgemm operation /* This routine performs a dgemm operation
* C := C + A * B * C := C + A * B
* where A, B, and C are lda-by-lda matrices stored in column-major format. * where A, B, and C are lda-by-lda matrices stored in column-major format.
* On exit, A and B maintain their input values. */ * On exit, A and B maintain their input values. */
void square_dgemm (int n, double* A, double* B, double* C) void square_dgemm(int n, double* A, double* B, double* C) {
{ /* For each row i of A */
// TODO: Implement the blocking optimization for (int i = 0; i < n; i += block_size) {
int i_next = min(i + block_size, n);
/* For each row i of A */
for (int i = 0; i < n; ++i) /* For each column j of B */
/* For each column j of B */ for (int j = 0; j < n; j += block_size) {
for (int j = 0; j < n; ++j) int j_next = min(j + block_size, n);
{
/* Compute C(i,j) */ for (int k = 0; k < n; k += block_size) {
double cij = C[i+j*n]; int k_next = min(k + block_size, n);
for( int k = 0; k < n; k++ ) naivemm(i, i_next, k, k_next, j, j_next, n, A, B, C);
cij += A[i+k*n] * B[k+j*n]; }
C[i+j*n] = cij; }
} }
} }

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,29 @@
#Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 2131.35 Percentage: 5.79
Size: 32 Mflop/s: 2387.28 Percentage: 6.49
Size: 96 Mflop/s: 1844.52 Percentage: 5.01
Size: 97 Mflop/s: 1991.74 Percentage: 5.41
Size: 127 Mflop/s: 1878.09 Percentage: 5.10
Size: 128 Mflop/s: 1466.11 Percentage: 3.98
Size: 129 Mflop/s: 1688.41 Percentage: 4.59
Size: 191 Mflop/s: 1747.28 Percentage: 4.75
Size: 192 Mflop/s: 1458.67 Percentage: 3.96
Size: 229 Mflop/s: 1714.53 Percentage: 4.66
Size: 255 Mflop/s: 1719.28 Percentage: 4.67
Size: 256 Mflop/s: 604.682 Percentage: 1.64
Size: 257 Mflop/s: 1733.51 Percentage: 4.71
Size: 319 Mflop/s: 1720.89 Percentage: 4.68
Size: 320 Mflop/s: 1514.24 Percentage: 4.11
Size: 321 Mflop/s: 1721.2 Percentage: 4.68
Size: 417 Mflop/s: 1718.17 Percentage: 4.67
Size: 479 Mflop/s: 1719.18 Percentage: 4.67
Size: 480 Mflop/s: 1594.88 Percentage: 4.33
Size: 511 Mflop/s: 1716.8 Percentage: 4.67
Size: 512 Mflop/s: 581.233 Percentage: 1.58
Size: 639 Mflop/s: 1678.33 Percentage: 4.56
Size: 640 Mflop/s: 749.008 Percentage: 2.04
Size: 767 Mflop/s: 1247.59 Percentage: 3.39
Size: 768 Mflop/s: 714.52 Percentage: 1.94
Size: 769 Mflop/s: 1603.09 Percentage: 4.36
#Average percentage of Peak = 4.24797

View file

@ -0,0 +1,29 @@
#Description: Reference dgemm.
Size: 31 Mflop/s: 23035.3 Percentage: 62.60
Size: 32 Mflop/s: 26290.9 Percentage: 71.44
Size: 96 Mflop/s: 32829.1 Percentage: 89.21
Size: 97 Mflop/s: 31312.6 Percentage: 85.09
Size: 127 Mflop/s: 29329 Percentage: 79.70
Size: 128 Mflop/s: 32578.6 Percentage: 88.53
Size: 129 Mflop/s: 31113.1 Percentage: 84.55
Size: 191 Mflop/s: 31590.5 Percentage: 85.84
Size: 192 Mflop/s: 35219.4 Percentage: 95.70
Size: 229 Mflop/s: 34236 Percentage: 93.03
Size: 255 Mflop/s: 32692.5 Percentage: 88.84
Size: 256 Mflop/s: 34510 Percentage: 93.78
Size: 257 Mflop/s: 32844.7 Percentage: 89.25
Size: 319 Mflop/s: 32950.9 Percentage: 89.54
Size: 320 Mflop/s: 36332.4 Percentage: 98.73
Size: 321 Mflop/s: 34460.3 Percentage: 93.64
Size: 417 Mflop/s: 34136 Percentage: 92.76
Size: 479 Mflop/s: 35101.8 Percentage: 95.39
Size: 480 Mflop/s: 35608.8 Percentage: 96.76
Size: 511 Mflop/s: 33768.6 Percentage: 91.76
Size: 512 Mflop/s: 35947 Percentage: 97.68
Size: 639 Mflop/s: 34572.5 Percentage: 93.95
Size: 640 Mflop/s: 35268.1 Percentage: 95.84
Size: 767 Mflop/s: 35731.4 Percentage: 97.10
Size: 768 Mflop/s: 37114.6 Percentage:100.85
Size: 769 Mflop/s: 34093.6 Percentage: 92.65
#Average percentage of Peak = 90.1618

View file

@ -0,0 +1,29 @@
#Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 1065.56 Percentage: 2.90
Size: 32 Mflop/s: 1703.76 Percentage: 4.63
Size: 96 Mflop/s: 1730.73 Percentage: 4.70
Size: 97 Mflop/s: 1728.48 Percentage: 4.70
Size: 127 Mflop/s: 1718.52 Percentage: 4.67
Size: 128 Mflop/s: 1533.64 Percentage: 4.17
Size: 129 Mflop/s: 1724.17 Percentage: 4.69
Size: 191 Mflop/s: 1636.9 Percentage: 4.45
Size: 192 Mflop/s: 1534.75 Percentage: 4.17
Size: 229 Mflop/s: 1604.48 Percentage: 4.36
Size: 255 Mflop/s: 1462.11 Percentage: 3.97
Size: 256 Mflop/s: 730.562 Percentage: 1.99
Size: 257 Mflop/s: 1483.12 Percentage: 4.03
Size: 319 Mflop/s: 1409.3 Percentage: 3.83
Size: 320 Mflop/s: 1303.95 Percentage: 3.54
Size: 321 Mflop/s: 1621.34 Percentage: 4.41
Size: 417 Mflop/s: 1496.69 Percentage: 4.07
Size: 479 Mflop/s: 1518.7 Percentage: 4.13
Size: 480 Mflop/s: 1429.18 Percentage: 3.88
Size: 511 Mflop/s: 1371.7 Percentage: 3.73
Size: 512 Mflop/s: 602.424 Percentage: 1.64
Size: 639 Mflop/s: 1339.03 Percentage: 3.64
Size: 640 Mflop/s: 913.949 Percentage: 2.48
Size: 767 Mflop/s: 1566.19 Percentage: 4.26
Size: 768 Mflop/s: 757.52 Percentage: 2.06
Size: 769 Mflop/s: 1559.49 Percentage: 4.24
#Average percentage of Peak = 3.81963