71 lines
2.3 KiB
C
71 lines
2.3 KiB
C
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
|
|
|
|
/*
  Please include the compiler name below (you may also include any other modules you would like to be loaded)

COMPILER= gnu

  Please include all compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines

CC = cc
OPT = -O3
CFLAGS = -Wall -std=gnu99 $(OPT)
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm

*/
|
|
|
|
/* Smaller of two values. Both arguments are evaluated twice, so do not pass
 * expressions with side effects. */
#define MIN(a,b) (((a)<(b))?(a):(b))

/* Short description of this dgemm implementation. */
const char* dgemm_desc = "Block-based dgemm.";

/* Edge length, in elements, of the square tiles used by square_dgemm.
 * square_dgemm treats values below 1 as 1 and values above n as n. */
int block_size = 32;

/* Plain triple loop computing C := C + A * B on n-by-n column-major
 * matrices. Needs no scratch memory; used by square_dgemm only when its
 * scratch buffer cannot be allocated. */
static void dgemm_unblocked(size_t n, const double* A, const double* B, double* C) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t k = 0; k < n; ++k) {
      const double b_kj = B[k + j * n];
      for (size_t i = 0; i < n; ++i) {
        C[i + j * n] += A[i + k * n] * b_kj;
      }
    }
  }
}

/* This routine performs a dgemm operation
 *  C := C + A * B
 * where A, B, and C are n-by-n matrices stored in column-major format.
 * On exit, A and B maintain their input values.
 *
 * n  matrix dimension; the call is a no-op when n <= 0.
 * A  left operand, n*n doubles, read only.
 * B  right operand, n*n doubles, read only.
 * C  accumulator, n*n doubles, updated in place.
 *
 * The product is computed tile by tile (tile edge taken from block_size).
 * A is first copied into a row-major scratch buffer so that the innermost
 * loop walks both operands with unit stride. */
void square_dgemm(int n, double* A, double* B, double* C) {
  if (n <= 0) {
    return;  /* nothing to multiply; also keeps the size arithmetic below valid */
  }

  /* size_t indexing avoids int overflow in products such as row * n. */
  const size_t dim = (size_t)n;

  /* Read the tunable once and clamp it to [1, n]: a non-positive step would
   * stall the tile loops, and a tile larger than the matrix wastes memory. */
  size_t bs = (block_size > 0) ? (size_t)block_size : 1;
  if (bs > dim) {
    bs = dim;
  }

  /* One heap allocation holds the row-major copy of A (dim*dim doubles)
   * followed by the tile accumulator (bs*bs <= dim*dim doubles). The heap is
   * used instead of stack arrays so that large n cannot overflow the stack. */
  double* scratch = NULL;
  if (dim <= SIZE_MAX / (2 * sizeof(double)) / dim) {
    scratch = malloc((dim * dim + bs * bs) * sizeof *scratch);
  }
  if (scratch == NULL) {
    /* Out of memory (or size overflow): still honor the contract, slowly. */
    dgemm_unblocked(dim, A, B, C);
    return;
  }

  double* A_row = scratch;            /* A_row[r * dim + c] == A(r, c) */
  double* tile = scratch + dim * dim; /* tile[r + c * bs], column-major */

  /* Transpose the storage order of A: column-major -> row-major. */
  for (size_t col = 0; col < dim; ++col) {
    const double* a_col = A + col * dim;
    for (size_t row = 0; row < dim; ++row) {
      A_row[row * dim + col] = a_col[row];
    }
  }

  for (size_t i = 0; i < dim; i += bs) {
    const size_t rows = MIN(bs, dim - i);

    for (size_t j = 0; j < dim; j += bs) {
      const size_t cols = MIN(bs, dim - j);

      /* Start this C tile's partial product from zero. */
      memset(tile, 0, bs * bs * sizeof *tile);

      for (size_t k = 0; k < dim; k += bs) {
        const size_t depth = MIN(bs, dim - k);

        /* tile += A(i.., k..) * B(k.., j..) */
        for (size_t ii = 0; ii < rows; ++ii) {
          const double* a_vec = A_row + (i + ii) * dim + k;
          for (size_t jj = 0; jj < cols; ++jj) {
            const double* b_vec = B + (j + jj) * dim + k;
            double acc = tile[ii + jj * bs];
            for (size_t kk = 0; kk < depth; ++kk) {
              acc += a_vec[kk] * b_vec[kk];
            }
            tile[ii + jj * bs] = acc;
          }
        }
      }

      /* Add the finished tile into C. It must be added, not copied, because
       * the contract is C := C + A * B. */
      for (size_t jj = 0; jj < cols; ++jj) {
        double* c_col = C + (j + jj) * dim + i;
        const double* t_col = tile + jj * bs;
        for (size_t ii = 0; ii < rows; ++ii) {
          c_col[ii] += t_col[ii];
        }
      }
    }
  }

  free(scratch);
}
|