hw1: blocked dgemm is correct
This commit is contained in:
parent
27fc66cf14
commit
3b9b6babbb
6 changed files with 1303 additions and 16 deletions
|
@ -79,7 +79,7 @@ int main (int argc, char **argv)
|
||||||
/* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
|
/* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
|
||||||
|
|
||||||
/* A representative subset of the first list. Currently uncommented. */
|
/* A representative subset of the first list. Currently uncommented. */
|
||||||
{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
|
{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
|
||||||
319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
|
319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
|
||||||
|
|
||||||
int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);
|
int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#include <string.h>
|
||||||
/*
|
/*
|
||||||
Please include compiler name below (you may also include any other modules you would like to be loaded)
|
Please include compiler name below (you may also include any other modules you would like to be loaded)
|
||||||
|
|
||||||
|
@ -13,25 +14,45 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const char* dgemm_desc = "Naive, three-loop dgemm.";
|
const char* dgemm_desc = "Block-based dgemm.";
|
||||||
|
|
||||||
|
/* Edge length, in elements, of the square tiles used by the blocked dgemm.
 * Kept static: it is an implementation detail of this translation unit. */
static const int block_size = 50;
|
||||||
|
|
||||||
|
/* Return the smaller of two ints.
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * provide an external definition, so a call that the compiler chooses not to
 * inline (e.g. when building without optimization) would fail to link. */
static inline int min(int a, int b) {
  return a < b ? a : b;
}
|
||||||
|
|
||||||
|
/* Three-loop multiply-accumulate restricted to one tile:
 *
 *   C(i,j) += A(i,k) * B(k,j)
 *
 * for i in [r_min, r_max), j in [c_min, c_max), k in [k_min, k_max).
 * A, B and C are indexed as n-by-n column-major arrays (element (r,c) lives
 * at index r + c * n). A and B are only read; C is updated in place.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 provides
 * no external definition, so a non-inlined call would fail to link. */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max,
                           int c_min, int c_max, int n,
                           const double* A, const double* B, double* C) {
  /* For each row i of the tile */
  for (int i = r_min; i < r_max; ++i) {
    /* For each column j of the tile */
    for (int j = c_min; j < c_max; ++j) {
      /* Accumulate the partial dot product over the k range into C(i,j) */
      for (int k = k_min; k < k_max; k++) {
        C[i + j * n] += A[i + k * n] * B[k + j * n];
      }
    }
  }
}
|
||||||
|
|
||||||
/* This routine performs a dgemm operation
|
/* This routine performs a dgemm operation
|
||||||
* C := C + A * B
|
* C := C + A * B
|
||||||
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
||||||
* On exit, A and B maintain their input values. */
|
* On exit, A and B maintain their input values. */
|
||||||
void square_dgemm (int n, double* A, double* B, double* C)
|
void square_dgemm(int n, double* A, double* B, double* C) {
|
||||||
{
|
/* For each block of rows of A (rows i .. i_next-1) */
|
||||||
// TODO: Implement the blocking optimization
|
for (int i = 0; i < n; i += block_size) {
|
||||||
|
int i_next = min(i + block_size, n);
|
||||||
/* For each row i of A */
|
|
||||||
for (int i = 0; i < n; ++i)
|
/* For each block of columns of B (columns j .. j_next-1) */
|
||||||
/* For each column j of B */
|
for (int j = 0; j < n; j += block_size) {
|
||||||
for (int j = 0; j < n; ++j)
|
int j_next = min(j + block_size, n);
|
||||||
{
|
|
||||||
/* Compute C(i,j) */
|
for (int k = 0; k < n; k += block_size) {
|
||||||
double cij = C[i+j*n];
|
int k_next = min(k + block_size, n);
|
||||||
for( int k = 0; k < n; k++ )
|
naivemm(i, i_next, k, k_next, j, j_next, n, A, B, C);
|
||||||
cij += A[i+k*n] * B[k+j*n];
|
}
|
||||||
C[i+j*n] = cij;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
1179
Project1/project_1_maggioni_claudio/matmult/timing.ps
Normal file
1179
Project1/project_1_maggioni_claudio/matmult/timing.ps
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,29 @@
|
||||||
|
#Description: Naive, three-loop dgemm.
|
||||||
|
|
||||||
|
Size: 31 Mflop/s: 2131.35 Percentage: 5.79
|
||||||
|
Size: 32 Mflop/s: 2387.28 Percentage: 6.49
|
||||||
|
Size: 96 Mflop/s: 1844.52 Percentage: 5.01
|
||||||
|
Size: 97 Mflop/s: 1991.74 Percentage: 5.41
|
||||||
|
Size: 127 Mflop/s: 1878.09 Percentage: 5.10
|
||||||
|
Size: 128 Mflop/s: 1466.11 Percentage: 3.98
|
||||||
|
Size: 129 Mflop/s: 1688.41 Percentage: 4.59
|
||||||
|
Size: 191 Mflop/s: 1747.28 Percentage: 4.75
|
||||||
|
Size: 192 Mflop/s: 1458.67 Percentage: 3.96
|
||||||
|
Size: 229 Mflop/s: 1714.53 Percentage: 4.66
|
||||||
|
Size: 255 Mflop/s: 1719.28 Percentage: 4.67
|
||||||
|
Size: 256 Mflop/s: 604.682 Percentage: 1.64
|
||||||
|
Size: 257 Mflop/s: 1733.51 Percentage: 4.71
|
||||||
|
Size: 319 Mflop/s: 1720.89 Percentage: 4.68
|
||||||
|
Size: 320 Mflop/s: 1514.24 Percentage: 4.11
|
||||||
|
Size: 321 Mflop/s: 1721.2 Percentage: 4.68
|
||||||
|
Size: 417 Mflop/s: 1718.17 Percentage: 4.67
|
||||||
|
Size: 479 Mflop/s: 1719.18 Percentage: 4.67
|
||||||
|
Size: 480 Mflop/s: 1594.88 Percentage: 4.33
|
||||||
|
Size: 511 Mflop/s: 1716.8 Percentage: 4.67
|
||||||
|
Size: 512 Mflop/s: 581.233 Percentage: 1.58
|
||||||
|
Size: 639 Mflop/s: 1678.33 Percentage: 4.56
|
||||||
|
Size: 640 Mflop/s: 749.008 Percentage: 2.04
|
||||||
|
Size: 767 Mflop/s: 1247.59 Percentage: 3.39
|
||||||
|
Size: 768 Mflop/s: 714.52 Percentage: 1.94
|
||||||
|
Size: 769 Mflop/s: 1603.09 Percentage: 4.36
|
||||||
|
#Average percentage of Peak = 4.24797
|
|
@ -0,0 +1,29 @@
|
||||||
|
#Description: Reference dgemm.
|
||||||
|
|
||||||
|
Size: 31 Mflop/s: 23035.3 Percentage: 62.60
|
||||||
|
Size: 32 Mflop/s: 26290.9 Percentage: 71.44
|
||||||
|
Size: 96 Mflop/s: 32829.1 Percentage: 89.21
|
||||||
|
Size: 97 Mflop/s: 31312.6 Percentage: 85.09
|
||||||
|
Size: 127 Mflop/s: 29329 Percentage: 79.70
|
||||||
|
Size: 128 Mflop/s: 32578.6 Percentage: 88.53
|
||||||
|
Size: 129 Mflop/s: 31113.1 Percentage: 84.55
|
||||||
|
Size: 191 Mflop/s: 31590.5 Percentage: 85.84
|
||||||
|
Size: 192 Mflop/s: 35219.4 Percentage: 95.70
|
||||||
|
Size: 229 Mflop/s: 34236 Percentage: 93.03
|
||||||
|
Size: 255 Mflop/s: 32692.5 Percentage: 88.84
|
||||||
|
Size: 256 Mflop/s: 34510 Percentage: 93.78
|
||||||
|
Size: 257 Mflop/s: 32844.7 Percentage: 89.25
|
||||||
|
Size: 319 Mflop/s: 32950.9 Percentage: 89.54
|
||||||
|
Size: 320 Mflop/s: 36332.4 Percentage: 98.73
|
||||||
|
Size: 321 Mflop/s: 34460.3 Percentage: 93.64
|
||||||
|
Size: 417 Mflop/s: 34136 Percentage: 92.76
|
||||||
|
Size: 479 Mflop/s: 35101.8 Percentage: 95.39
|
||||||
|
Size: 480 Mflop/s: 35608.8 Percentage: 96.76
|
||||||
|
Size: 511 Mflop/s: 33768.6 Percentage: 91.76
|
||||||
|
Size: 512 Mflop/s: 35947 Percentage: 97.68
|
||||||
|
Size: 639 Mflop/s: 34572.5 Percentage: 93.95
|
||||||
|
Size: 640 Mflop/s: 35268.1 Percentage: 95.84
|
||||||
|
Size: 767 Mflop/s: 35731.4 Percentage: 97.10
|
||||||
|
Size: 768 Mflop/s: 37114.6 Percentage:100.85
|
||||||
|
Size: 769 Mflop/s: 34093.6 Percentage: 92.65
|
||||||
|
#Average percentage of Peak = 90.1618
|
|
@ -0,0 +1,29 @@
|
||||||
|
#Description: Naive, three-loop dgemm.
|
||||||
|
|
||||||
|
Size: 31 Mflop/s: 1065.56 Percentage: 2.90
|
||||||
|
Size: 32 Mflop/s: 1703.76 Percentage: 4.63
|
||||||
|
Size: 96 Mflop/s: 1730.73 Percentage: 4.70
|
||||||
|
Size: 97 Mflop/s: 1728.48 Percentage: 4.70
|
||||||
|
Size: 127 Mflop/s: 1718.52 Percentage: 4.67
|
||||||
|
Size: 128 Mflop/s: 1533.64 Percentage: 4.17
|
||||||
|
Size: 129 Mflop/s: 1724.17 Percentage: 4.69
|
||||||
|
Size: 191 Mflop/s: 1636.9 Percentage: 4.45
|
||||||
|
Size: 192 Mflop/s: 1534.75 Percentage: 4.17
|
||||||
|
Size: 229 Mflop/s: 1604.48 Percentage: 4.36
|
||||||
|
Size: 255 Mflop/s: 1462.11 Percentage: 3.97
|
||||||
|
Size: 256 Mflop/s: 730.562 Percentage: 1.99
|
||||||
|
Size: 257 Mflop/s: 1483.12 Percentage: 4.03
|
||||||
|
Size: 319 Mflop/s: 1409.3 Percentage: 3.83
|
||||||
|
Size: 320 Mflop/s: 1303.95 Percentage: 3.54
|
||||||
|
Size: 321 Mflop/s: 1621.34 Percentage: 4.41
|
||||||
|
Size: 417 Mflop/s: 1496.69 Percentage: 4.07
|
||||||
|
Size: 479 Mflop/s: 1518.7 Percentage: 4.13
|
||||||
|
Size: 480 Mflop/s: 1429.18 Percentage: 3.88
|
||||||
|
Size: 511 Mflop/s: 1371.7 Percentage: 3.73
|
||||||
|
Size: 512 Mflop/s: 602.424 Percentage: 1.64
|
||||||
|
Size: 639 Mflop/s: 1339.03 Percentage: 3.64
|
||||||
|
Size: 640 Mflop/s: 913.949 Percentage: 2.48
|
||||||
|
Size: 767 Mflop/s: 1566.19 Percentage: 4.26
|
||||||
|
Size: 768 Mflop/s: 757.52 Percentage: 2.06
|
||||||
|
Size: 769 Mflop/s: 1559.49 Percentage: 4.24
|
||||||
|
#Average percentage of Peak = 3.81963
|
Reference in a new issue