hw1: blocked dgemm is correct
This commit is contained in:
parent
27fc66cf14
commit
3b9b6babbb
6 changed files with 1303 additions and 16 deletions
|
@ -1,3 +1,4 @@
|
|||
#include <string.h>
|
||||
/*
|
||||
Please include compiler name below (you may also include any other modules you would like to be loaded)
|
||||
|
||||
|
@ -13,25 +14,45 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
|
|||
|
||||
*/
|
||||
|
||||
/* Short description of this implementation (presumably printed by the
 * benchmark harness in its "#Description:" line -- confirm against the driver). */
const char* dgemm_desc = "Block-based dgemm.";

/* Edge length, in elements, of the square tiles that square_dgemm walks over.
 * Tiles on the matrix edge are clipped when n is not a multiple of this. */
const int block_size = 50;
|
||||
|
||||
/* Return the smaller of a and b (returns b when they are equal).
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 does not
 * emit an external definition, so a call that the compiler chooses not to
 * inline (e.g. at -O0) would fail to link. */
static inline int min(int a, int b) {
  return a < b ? a : b;
}
|
||||
|
||||
/* Accumulate one tile of the product into C:
 *
 *   C(i, j) += sum over k in [k_min, k_max) of A(i, k) * B(k, j)
 *
 * for every i in [r_min, r_max) and j in [c_min, c_max). All ranges are
 * half-open. Element (row, col) of each matrix is read at index
 * row + col * n, i.e. column-major storage with leading dimension n.
 * A and B are only read; C is updated in place. The matrices are expected
 * not to overlap.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 does not
 * emit an external definition, so a non-inlined call would fail to link. */
static inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, const double* A, const double* B, double* C) {
  /* For each column j of B (and of C) */
  for (int j = c_min; j < c_max; ++j) {
    /* For each term k of the inner product. Every C(i, j) still receives
     * its k contributions in ascending order, so the rounding sequence is
     * the same as with an i-j-k loop nest. */
    for (int k = k_min; k < k_max; k++) {
      const double b_kj = B[k + j * n];

      /* For each row i: consecutive i are adjacent in memory for both
       * A(:, k) and C(:, j), so the innermost loop is stride-1. */
      for (int i = r_min; i < r_max; ++i) {
        C[i + j * n] += A[i + k * n] * b_kj;
      }
    }
  }
}
|
||||
|
||||
/* This routine performs a dgemm operation
|
||||
* C := C + A * B
|
||||
* where A, B, and C are n-by-n matrices stored in column-major format.
|
||||
* On exit, A and B maintain their input values. */
|
||||
void square_dgemm (int n, double* A, double* B, double* C)
|
||||
{
|
||||
// TODO: Implement the blocking optimization
|
||||
|
||||
void square_dgemm(int n, double* A, double* B, double* C) {
|
||||
/* For each row i of A */
|
||||
for (int i = 0; i < n; ++i)
|
||||
for (int i = 0; i < n; i += block_size) {
|
||||
int i_next = min(i + block_size, n);
|
||||
|
||||
/* For each column j of B */
|
||||
for (int j = 0; j < n; ++j)
|
||||
{
|
||||
/* Compute C(i,j) */
|
||||
double cij = C[i+j*n];
|
||||
for( int k = 0; k < n; k++ )
|
||||
cij += A[i+k*n] * B[k+j*n];
|
||||
C[i+j*n] = cij;
|
||||
for (int j = 0; j < n; j += block_size) {
|
||||
int j_next = min(j + block_size, n);
|
||||
|
||||
for (int k = 0; k < n; k += block_size) {
|
||||
int k_next = min(k + block_size, n);
|
||||
naivemm(i, i_next, k, k_next, j, j_next, n, A, B, C);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
1179
Project1/project_1_maggioni_claudio/matmult/timing.ps
Normal file
1179
Project1/project_1_maggioni_claudio/matmult/timing.ps
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,29 @@
|
|||
#Description: Naive, three-loop dgemm.
|
||||
|
||||
Size: 31 Mflop/s: 2131.35 Percentage: 5.79
|
||||
Size: 32 Mflop/s: 2387.28 Percentage: 6.49
|
||||
Size: 96 Mflop/s: 1844.52 Percentage: 5.01
|
||||
Size: 97 Mflop/s: 1991.74 Percentage: 5.41
|
||||
Size: 127 Mflop/s: 1878.09 Percentage: 5.10
|
||||
Size: 128 Mflop/s: 1466.11 Percentage: 3.98
|
||||
Size: 129 Mflop/s: 1688.41 Percentage: 4.59
|
||||
Size: 191 Mflop/s: 1747.28 Percentage: 4.75
|
||||
Size: 192 Mflop/s: 1458.67 Percentage: 3.96
|
||||
Size: 229 Mflop/s: 1714.53 Percentage: 4.66
|
||||
Size: 255 Mflop/s: 1719.28 Percentage: 4.67
|
||||
Size: 256 Mflop/s: 604.682 Percentage: 1.64
|
||||
Size: 257 Mflop/s: 1733.51 Percentage: 4.71
|
||||
Size: 319 Mflop/s: 1720.89 Percentage: 4.68
|
||||
Size: 320 Mflop/s: 1514.24 Percentage: 4.11
|
||||
Size: 321 Mflop/s: 1721.2 Percentage: 4.68
|
||||
Size: 417 Mflop/s: 1718.17 Percentage: 4.67
|
||||
Size: 479 Mflop/s: 1719.18 Percentage: 4.67
|
||||
Size: 480 Mflop/s: 1594.88 Percentage: 4.33
|
||||
Size: 511 Mflop/s: 1716.8 Percentage: 4.67
|
||||
Size: 512 Mflop/s: 581.233 Percentage: 1.58
|
||||
Size: 639 Mflop/s: 1678.33 Percentage: 4.56
|
||||
Size: 640 Mflop/s: 749.008 Percentage: 2.04
|
||||
Size: 767 Mflop/s: 1247.59 Percentage: 3.39
|
||||
Size: 768 Mflop/s: 714.52 Percentage: 1.94
|
||||
Size: 769 Mflop/s: 1603.09 Percentage: 4.36
|
||||
#Average percentage of Peak = 4.24797
|
|
@ -0,0 +1,29 @@
|
|||
#Description: Reference dgemm.
|
||||
|
||||
Size: 31 Mflop/s: 23035.3 Percentage: 62.60
|
||||
Size: 32 Mflop/s: 26290.9 Percentage: 71.44
|
||||
Size: 96 Mflop/s: 32829.1 Percentage: 89.21
|
||||
Size: 97 Mflop/s: 31312.6 Percentage: 85.09
|
||||
Size: 127 Mflop/s: 29329 Percentage: 79.70
|
||||
Size: 128 Mflop/s: 32578.6 Percentage: 88.53
|
||||
Size: 129 Mflop/s: 31113.1 Percentage: 84.55
|
||||
Size: 191 Mflop/s: 31590.5 Percentage: 85.84
|
||||
Size: 192 Mflop/s: 35219.4 Percentage: 95.70
|
||||
Size: 229 Mflop/s: 34236 Percentage: 93.03
|
||||
Size: 255 Mflop/s: 32692.5 Percentage: 88.84
|
||||
Size: 256 Mflop/s: 34510 Percentage: 93.78
|
||||
Size: 257 Mflop/s: 32844.7 Percentage: 89.25
|
||||
Size: 319 Mflop/s: 32950.9 Percentage: 89.54
|
||||
Size: 320 Mflop/s: 36332.4 Percentage: 98.73
|
||||
Size: 321 Mflop/s: 34460.3 Percentage: 93.64
|
||||
Size: 417 Mflop/s: 34136 Percentage: 92.76
|
||||
Size: 479 Mflop/s: 35101.8 Percentage: 95.39
|
||||
Size: 480 Mflop/s: 35608.8 Percentage: 96.76
|
||||
Size: 511 Mflop/s: 33768.6 Percentage: 91.76
|
||||
Size: 512 Mflop/s: 35947 Percentage: 97.68
|
||||
Size: 639 Mflop/s: 34572.5 Percentage: 93.95
|
||||
Size: 640 Mflop/s: 35268.1 Percentage: 95.84
|
||||
Size: 767 Mflop/s: 35731.4 Percentage: 97.10
|
||||
Size: 768 Mflop/s: 37114.6 Percentage:100.85
|
||||
Size: 769 Mflop/s: 34093.6 Percentage: 92.65
|
||||
#Average percentage of Peak = 90.1618
|
|
@ -0,0 +1,29 @@
|
|||
#Description: Naive, three-loop dgemm.
|
||||
|
||||
Size: 31 Mflop/s: 1065.56 Percentage: 2.90
|
||||
Size: 32 Mflop/s: 1703.76 Percentage: 4.63
|
||||
Size: 96 Mflop/s: 1730.73 Percentage: 4.70
|
||||
Size: 97 Mflop/s: 1728.48 Percentage: 4.70
|
||||
Size: 127 Mflop/s: 1718.52 Percentage: 4.67
|
||||
Size: 128 Mflop/s: 1533.64 Percentage: 4.17
|
||||
Size: 129 Mflop/s: 1724.17 Percentage: 4.69
|
||||
Size: 191 Mflop/s: 1636.9 Percentage: 4.45
|
||||
Size: 192 Mflop/s: 1534.75 Percentage: 4.17
|
||||
Size: 229 Mflop/s: 1604.48 Percentage: 4.36
|
||||
Size: 255 Mflop/s: 1462.11 Percentage: 3.97
|
||||
Size: 256 Mflop/s: 730.562 Percentage: 1.99
|
||||
Size: 257 Mflop/s: 1483.12 Percentage: 4.03
|
||||
Size: 319 Mflop/s: 1409.3 Percentage: 3.83
|
||||
Size: 320 Mflop/s: 1303.95 Percentage: 3.54
|
||||
Size: 321 Mflop/s: 1621.34 Percentage: 4.41
|
||||
Size: 417 Mflop/s: 1496.69 Percentage: 4.07
|
||||
Size: 479 Mflop/s: 1518.7 Percentage: 4.13
|
||||
Size: 480 Mflop/s: 1429.18 Percentage: 3.88
|
||||
Size: 511 Mflop/s: 1371.7 Percentage: 3.73
|
||||
Size: 512 Mflop/s: 602.424 Percentage: 1.64
|
||||
Size: 639 Mflop/s: 1339.03 Percentage: 3.64
|
||||
Size: 640 Mflop/s: 913.949 Percentage: 2.48
|
||||
Size: 767 Mflop/s: 1566.19 Percentage: 4.26
|
||||
Size: 768 Mflop/s: 757.52 Percentage: 2.06
|
||||
Size: 769 Mflop/s: 1559.49 Percentage: 4.24
|
||||
#Average percentage of Peak = 3.81963
|
Reference in a new issue