hw1: ex2 optimization
This commit is contained in:
parent
df1cb3cf01
commit
86f9021ce5
3 changed files with 14 additions and 1978710 deletions
File diff suppressed because it is too large
Load diff
|
@ -17,17 +17,17 @@ LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKL
|
||||||
|
|
||||||
const char* dgemm_desc = "Block-based dgemm.";
|
const char* dgemm_desc = "Block-based dgemm.";
|
||||||
|
|
||||||
const int block_size = 18;
|
const int block_size = 27;
|
||||||
|
|
||||||
inline int min(int a, int b) {
|
inline int min(int a, int b) {
|
||||||
return a < b ? a : b;
|
return a < b ? a : b;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, double* A, double* B, double* C_temp) {
|
inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, double* A_row, double* B, double* C_temp) {
|
||||||
for (int i = r_min, ii = 0; i < r_max; ++i, ++ii) {
|
for (int i = r_min, ii = 0; i < r_max; ++i, ++ii) {
|
||||||
for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
|
for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) {
|
||||||
for (int k = k_min; k < k_max; k++) {
|
for (int k = k_min; k < k_max; k++) {
|
||||||
C_temp[ii + jj * block_size] += A[i + k * n] * B[k + j * n];
|
C_temp[ii + jj * block_size] += A_row[i * n + k] * B[k + j * n];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -44,8 +44,18 @@ inline void store_c(double* C, double* C_temp, int r_min, int r_max, int c_min,
|
||||||
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
||||||
* On exit, A and B maintain their input values. */
|
* On exit, A and B maintain their input values. */
|
||||||
void square_dgemm(int n, double* A, double* B, double* C) {
|
void square_dgemm(int n, double* A, double* B, double* C) {
|
||||||
|
double A_row[n * n];
|
||||||
double C_temp[block_size * block_size];
|
double C_temp[block_size * block_size];
|
||||||
|
|
||||||
|
for (int m = 0; m < n; ++m) {
|
||||||
|
double row_tmp[n];
|
||||||
|
memcpy(row_tmp, A + m * n, n * sizeof(double));
|
||||||
|
|
||||||
|
for (int l = 0; l < n; ++l) {
|
||||||
|
A_row[l * n + m] = row_tmp[l];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < n; i += block_size) {
|
for (int i = 0; i < n; i += block_size) {
|
||||||
int i_next = min(i + block_size, n);
|
int i_next = min(i + block_size, n);
|
||||||
|
|
||||||
|
@ -56,7 +66,7 @@ void square_dgemm(int n, double* A, double* B, double* C) {
|
||||||
|
|
||||||
for (int k = 0; k < n; k += block_size) {
|
for (int k = 0; k < n; k += block_size) {
|
||||||
int k_next = min(k + block_size, n);
|
int k_next = min(k + block_size, n);
|
||||||
naivemm(i, i_next, k, k_next, j, j_next, n, A, B, C_temp);
|
naivemm(i, i_next, k, k_next, j, j_next, n, A_row, B, C_temp);
|
||||||
}
|
}
|
||||||
|
|
||||||
store_c(C, C_temp, i, i_next, j, j_next, n);
|
store_c(C, C_temp, i, i_next, j, j_next, n);
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
#Description: Naive, three-loop dgemm.
|
|
||||||
|
|
||||||
Size: 31 Mflop/s: 2431.2 Percentage: 6.61
|
|
||||||
#Average percentage of Peak = 6.60652
|
|
Reference in a new issue