#include /* Please include compiler name below (you may also include any other modules you would like to be loaded) COMPILER= gnu Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines CC = cc OPT = -O3 CFLAGS = -Wall -std=gnu99 $(OPT) MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm */ const char* dgemm_desc = "Block-based dgemm."; const int block_size = 27; inline int min(int a, int b) { return a < b ? a : b; } inline void naivemm(int r_min, int r_max, int k_min, int k_max, int c_min, int c_max, int n, double* A_row, double* B, double* C_temp) { for (int i = r_min, ii = 0; i < r_max; ++i, ++ii) { for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) { for (int k = k_min; k < k_max; k++) { C_temp[ii + jj * block_size] += A_row[i * n + k] * B[k + j * n]; } } } } inline void store_c(double* C, double* C_temp, int r_min, int r_max, int c_min, int c_max, int n) { for (int j = c_min, jj = 0; j < c_max; ++j, ++jj) { memcpy(C + j * n + r_min, C_temp + jj * block_size, (r_max - r_min) * sizeof(double)); } } /* This routine performs a dgemm operation * C := C + A * B * where A, B, and C are lda-by-lda matrices stored in column-major format. * On exit, A and B maintain their input values. */ void square_dgemm(int n, double* A, double* B, double* C) { double A_row[n * n]; double C_temp[block_size * block_size]; for (int m = 0; m < n; ++m) { double row_tmp[n]; memcpy(row_tmp, A + m * n, n * sizeof(double)); for (int l = 0; l < n; ++l) { A_row[l * n + m] = row_tmp[l]; } } for (int i = 0; i < n; i += block_size) { int i_next = min(i + block_size, n); for (int j = 0; j < n; j += block_size) { int j_next = min(j + block_size, n); memset(C_temp, 0, block_size * block_size * sizeof(double)); for (int k = 0; k < n; k += block_size) { int k_next = min(k + block_size, n); naivemm(i, i_next, k, k_next, j, j_next, n, A_row, B, C_temp); } store_c(C, C_temp, i, i_next, j, j_next, n); } } }