hw1: blocked dgemm is correct

This commit is contained in:
Claudio Maggioni 2022-09-28 13:20:37 +02:00
parent 27fc66cf14
commit 3b9b6babbb
6 changed files with 1303 additions and 16 deletions

View file

@ -79,7 +79,7 @@ int main (int argc, char **argv)
/* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
/* A representative subset of the first list. Currently uncommented. */
{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);

View file

@ -1,3 +1,4 @@
#include <string.h>
/*
Please include compiler name below (you may also include any other modules you would like to be loaded)
@ -13,25 +14,45 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
*/
/* Human-readable label for this dgemm implementation.
 * NOTE(review): dgemm_desc is defined twice below, which looks like the
 * removed and the added line of the diff shown together; a real translation
 * unit can only contain one of them -- confirm against the actual file. */
const char* dgemm_desc = "Naive, three-loop dgemm.";
const char* dgemm_desc = "Block-based dgemm.";
/* Tile edge length, in matrix elements: square_dgemm advances its i, j and k
 * loops by this amount and clips the final tile to n. */
const int block_size = 50;
/* Return the smaller of two ints (b when the two are equal).
 *
 * Declared static on purpose: in C99 and later a plain `inline` definition
 * does not provide an external definition, so any call the compiler decides
 * not to inline (for example at -O0) would fail to link. */
static inline int min(int a, int b) {
    return a < b ? a : b;
}
/* Accumulate one tile of a matrix product into C.
 *
 * For every row i in [r_min, r_max) and every column j in [c_min, c_max):
 *     C(i,j) += sum over k in [k_min, k_max) of A(i,k) * B(k,j)
 *
 * All three matrices are addressed as M[row + col * n], i.e. column-major
 * with leading dimension n. A and B are only read; C is updated in place.
 * Empty ranges leave C unchanged.
 *
 * Declared static on purpose: in C99 and later a plain `inline` definition
 * does not provide an external definition, so a call that is not inlined
 * would fail to link.
 *
 * NOTE(review): the running sum is kept in a local, which assumes C does not
 * overlap A or B -- confirm no caller passes aliased buffers. */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max,
                           int c_min, int c_max, int n,
                           const double* A, const double* B, double* C) {
    /* For each row i of the tile */
    for (int i = r_min; i < r_max; ++i) {
        /* For each column j of the tile */
        for (int j = c_min; j < c_max; ++j) {
            /* Load C(i,j) once, add the k terms in increasing k order,
             * then store it once. */
            double cij = C[i + j * n];
            for (int k = k_min; k < k_max; k++) {
                cij += A[i + k * n] * B[k + j * n];
            }
            C[i + j * n] = cij;
        }
    }
}
/* This routine performs a dgemm operation
 * C := C + A * B
 * where A, B, and C are n-by-n matrices stored in column-major format
 * (element (i,j) lives at index i + j*n).
 * On exit, A and B maintain their input values.
 *
 * NOTE(review): the lines below contain two signatures and two loop nests --
 * the earlier three-loop version and the blocked version -- which looks like
 * removed and added diff lines shown together. Confirm against the real file
 * before changing any code here. */
void square_dgemm (int n, double* A, double* B, double* C)
{
// TODO: Implement the blocking optimization
void square_dgemm(int n, double* A, double* B, double* C) {
/* For each block of rows [i, i_next) of A; min() clips the last block to n */
for (int i = 0; i < n; i += block_size) {
int i_next = min(i + block_size, n);
/* For each row i of A */
for (int i = 0; i < n; ++i)
/* For each column j of B */
for (int j = 0; j < n; ++j)
{
/* Compute C(i,j) */
double cij = C[i+j*n];
for( int k = 0; k < n; k++ )
cij += A[i+k*n] * B[k+j*n];
C[i+j*n] = cij;
/* For each block of columns [j, j_next) of B */
for (int j = 0; j < n; j += block_size) {
int j_next = min(j + block_size, n);
/* For each block [k, k_next) along the shared inner dimension */
for (int k = 0; k < n; k += block_size) {
int k_next = min(k + block_size, n);
/* Add this tile's partial product into the C(i..i_next, j..j_next) tile */
naivemm(i, i_next, k, k_next, j, j_next, n, A, B, C);
}
}
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,29 @@
#Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 2131.35 Percentage: 5.79
Size: 32 Mflop/s: 2387.28 Percentage: 6.49
Size: 96 Mflop/s: 1844.52 Percentage: 5.01
Size: 97 Mflop/s: 1991.74 Percentage: 5.41
Size: 127 Mflop/s: 1878.09 Percentage: 5.10
Size: 128 Mflop/s: 1466.11 Percentage: 3.98
Size: 129 Mflop/s: 1688.41 Percentage: 4.59
Size: 191 Mflop/s: 1747.28 Percentage: 4.75
Size: 192 Mflop/s: 1458.67 Percentage: 3.96
Size: 229 Mflop/s: 1714.53 Percentage: 4.66
Size: 255 Mflop/s: 1719.28 Percentage: 4.67
Size: 256 Mflop/s: 604.682 Percentage: 1.64
Size: 257 Mflop/s: 1733.51 Percentage: 4.71
Size: 319 Mflop/s: 1720.89 Percentage: 4.68
Size: 320 Mflop/s: 1514.24 Percentage: 4.11
Size: 321 Mflop/s: 1721.2 Percentage: 4.68
Size: 417 Mflop/s: 1718.17 Percentage: 4.67
Size: 479 Mflop/s: 1719.18 Percentage: 4.67
Size: 480 Mflop/s: 1594.88 Percentage: 4.33
Size: 511 Mflop/s: 1716.8 Percentage: 4.67
Size: 512 Mflop/s: 581.233 Percentage: 1.58
Size: 639 Mflop/s: 1678.33 Percentage: 4.56
Size: 640 Mflop/s: 749.008 Percentage: 2.04
Size: 767 Mflop/s: 1247.59 Percentage: 3.39
Size: 768 Mflop/s: 714.52 Percentage: 1.94
Size: 769 Mflop/s: 1603.09 Percentage: 4.36
#Average percentage of Peak = 4.24797

View file

@ -0,0 +1,29 @@
#Description: Reference dgemm.
Size: 31 Mflop/s: 23035.3 Percentage: 62.60
Size: 32 Mflop/s: 26290.9 Percentage: 71.44
Size: 96 Mflop/s: 32829.1 Percentage: 89.21
Size: 97 Mflop/s: 31312.6 Percentage: 85.09
Size: 127 Mflop/s: 29329 Percentage: 79.70
Size: 128 Mflop/s: 32578.6 Percentage: 88.53
Size: 129 Mflop/s: 31113.1 Percentage: 84.55
Size: 191 Mflop/s: 31590.5 Percentage: 85.84
Size: 192 Mflop/s: 35219.4 Percentage: 95.70
Size: 229 Mflop/s: 34236 Percentage: 93.03
Size: 255 Mflop/s: 32692.5 Percentage: 88.84
Size: 256 Mflop/s: 34510 Percentage: 93.78
Size: 257 Mflop/s: 32844.7 Percentage: 89.25
Size: 319 Mflop/s: 32950.9 Percentage: 89.54
Size: 320 Mflop/s: 36332.4 Percentage: 98.73
Size: 321 Mflop/s: 34460.3 Percentage: 93.64
Size: 417 Mflop/s: 34136 Percentage: 92.76
Size: 479 Mflop/s: 35101.8 Percentage: 95.39
Size: 480 Mflop/s: 35608.8 Percentage: 96.76
Size: 511 Mflop/s: 33768.6 Percentage: 91.76
Size: 512 Mflop/s: 35947 Percentage: 97.68
Size: 639 Mflop/s: 34572.5 Percentage: 93.95
Size: 640 Mflop/s: 35268.1 Percentage: 95.84
Size: 767 Mflop/s: 35731.4 Percentage: 97.10
Size: 768 Mflop/s: 37114.6 Percentage:100.85
Size: 769 Mflop/s: 34093.6 Percentage: 92.65
#Average percentage of Peak = 90.1618

View file

@ -0,0 +1,29 @@
#Description: Naive, three-loop dgemm.
Size: 31 Mflop/s: 1065.56 Percentage: 2.90
Size: 32 Mflop/s: 1703.76 Percentage: 4.63
Size: 96 Mflop/s: 1730.73 Percentage: 4.70
Size: 97 Mflop/s: 1728.48 Percentage: 4.70
Size: 127 Mflop/s: 1718.52 Percentage: 4.67
Size: 128 Mflop/s: 1533.64 Percentage: 4.17
Size: 129 Mflop/s: 1724.17 Percentage: 4.69
Size: 191 Mflop/s: 1636.9 Percentage: 4.45
Size: 192 Mflop/s: 1534.75 Percentage: 4.17
Size: 229 Mflop/s: 1604.48 Percentage: 4.36
Size: 255 Mflop/s: 1462.11 Percentage: 3.97
Size: 256 Mflop/s: 730.562 Percentage: 1.99
Size: 257 Mflop/s: 1483.12 Percentage: 4.03
Size: 319 Mflop/s: 1409.3 Percentage: 3.83
Size: 320 Mflop/s: 1303.95 Percentage: 3.54
Size: 321 Mflop/s: 1621.34 Percentage: 4.41
Size: 417 Mflop/s: 1496.69 Percentage: 4.07
Size: 479 Mflop/s: 1518.7 Percentage: 4.13
Size: 480 Mflop/s: 1429.18 Percentage: 3.88
Size: 511 Mflop/s: 1371.7 Percentage: 3.73
Size: 512 Mflop/s: 602.424 Percentage: 1.64
Size: 639 Mflop/s: 1339.03 Percentage: 3.64
Size: 640 Mflop/s: 913.949 Percentage: 2.48
Size: 767 Mflop/s: 1566.19 Percentage: 4.26
Size: 768 Mflop/s: 757.52 Percentage: 2.06
Size: 769 Mflop/s: 1559.49 Percentage: 4.24
#Average percentage of Peak = 3.81963