hw1: added sources
This commit is contained in:
parent
497ded0368
commit
26a8e93698
7 changed files with 365 additions and 0 deletions
33
Project1/project_1_maggioni_claudio/matmult/Makefile
Executable file
33
Project1/project_1_maggioni_claudio/matmult/Makefile
Executable file
|
@ -0,0 +1,33 @@
|
||||||
|
# On Euler, we will benchmark your DGEMM's performance against the performance
|
||||||
|
# of the default vendor-tuned DGEMM. This is done in benchmark-blas.
|
||||||
|
#
|
||||||
|
|
||||||
|
CC = gcc
|
||||||
|
OPT = -O2
|
||||||
|
CFLAGS = -Wall -std=gnu99 $(OPT)
|
||||||
|
LDFLAGS = -Wall
|
||||||
|
# librt is needed for clock_gettime
|
||||||
|
LDLIBS = -lrt -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm -ldl -m64 -I${MKLROOT}/include
|
||||||
|
|
||||||
|
targets = benchmark-naive benchmark-blocked benchmark-blas
|
||||||
|
objects = benchmark.o dgemm-naive.o dgemm-blocked.o dgemm-blas.o
|
||||||
|
|
||||||
|
.PHONY : default
|
||||||
|
default : all
|
||||||
|
|
||||||
|
.PHONY : all
|
||||||
|
all : clean $(targets)
|
||||||
|
|
||||||
|
benchmark-naive : benchmark.o dgemm-naive.o
|
||||||
|
$(CC) -o $@ $^ $(LDLIBS)
|
||||||
|
benchmark-blocked : benchmark.o dgemm-blocked.o
|
||||||
|
$(CC) -o $@ $^ $(LDLIBS)
|
||||||
|
benchmark-blas : benchmark.o dgemm-blas.o
|
||||||
|
$(CC) -o $@ $^ $(LDLIBS)
|
||||||
|
|
||||||
|
%.o : %.c
|
||||||
|
$(CC) -c $(CFLAGS) $<
|
||||||
|
|
||||||
|
.PHONY : clean
|
||||||
|
clean:
|
||||||
|
rm -f $(targets) $(objects)
|
174
Project1/project_1_maggioni_claudio/matmult/benchmark.c
Normal file
174
Project1/project_1_maggioni_claudio/matmult/benchmark.c
Normal file
|
@ -0,0 +1,174 @@
|
||||||
|
#include <stdlib.h> // For: exit, drand48, malloc, free, NULL, EXIT_FAILURE
|
||||||
|
#include <stdio.h> // For: perror
|
||||||
|
#include <string.h> // For: memset
|
||||||
|
|
||||||
|
#include <float.h> // For: DBL_EPSILON
|
||||||
|
#include <math.h> // For: fabs
|
||||||
|
|
||||||
|
#ifdef GETTIMEOFDAY
|
||||||
|
#include <sys/time.h> // For struct timeval, gettimeofday
|
||||||
|
#else
|
||||||
|
#include <time.h> // For struct timespec, clock_gettime, CLOCK_MONOTONIC
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// On icsmaster
|
||||||
|
// 2.3 GHz * 8 vector width * 2 flops for FMA = 36.8 GF/s
|
||||||
|
#define MAX_SPEED 36.8
|
||||||
|
|
||||||
|
/* reference_dgemm wraps a call to the BLAS-3 routine DGEMM, via the standard FORTRAN interface - hence the reference semantics. */
|
||||||
|
#define DGEMM dgemm_
|
||||||
|
extern void DGEMM (char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
|
||||||
|
void reference_dgemm (int N, double ALPHA, double* A, double* B, double* C)
|
||||||
|
{
|
||||||
|
char TRANSA = 'N';
|
||||||
|
char TRANSB = 'N';
|
||||||
|
int M = N;
|
||||||
|
int K = N;
|
||||||
|
double BETA = 1.;
|
||||||
|
int LDA = N;
|
||||||
|
int LDB = N;
|
||||||
|
int LDC = N;
|
||||||
|
DGEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Your function must have the following signature: */
|
||||||
|
extern const char* dgemm_desc;
|
||||||
|
extern void square_dgemm (int, double*, double*, double*);
|
||||||
|
|
||||||
|
double wall_time ()
|
||||||
|
{
|
||||||
|
#ifdef GETTIMEOFDAY
|
||||||
|
struct timeval t;
|
||||||
|
gettimeofday (&t, NULL);
|
||||||
|
return 1.*t.tv_sec + 1.e-6*t.tv_usec;
|
||||||
|
#else
|
||||||
|
struct timespec t;
|
||||||
|
clock_gettime (CLOCK_MONOTONIC, &t);
|
||||||
|
return 1.*t.tv_sec + 1.e-9*t.tv_nsec;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void die (const char* message)
|
||||||
|
{
|
||||||
|
perror (message);
|
||||||
|
exit (EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void fill (double* p, int n)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
p[i] = 2 * drand48() - 1; // Uniformly distributed over [-1, 1]
|
||||||
|
}
|
||||||
|
|
||||||
|
void absolute_value (double *p, int n)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
p[i] = fabs (p[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The benchmarking program */
|
||||||
|
int main (int argc, char **argv)
|
||||||
|
{
|
||||||
|
printf ("#Description:\t%s\n\n", dgemm_desc);
|
||||||
|
|
||||||
|
/* Test sizes should highlight performance dips at multiples of certain powers-of-two */
|
||||||
|
|
||||||
|
int test_sizes[] =
|
||||||
|
|
||||||
|
/* Multiples-of-32, +/- 1. Currently commented. */
|
||||||
|
/* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
|
||||||
|
|
||||||
|
/* A representative subset of the first list. Currently uncommented. */
|
||||||
|
{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
|
||||||
|
319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
|
||||||
|
|
||||||
|
int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);
|
||||||
|
|
||||||
|
/* assume last size is also the largest size */
|
||||||
|
int nmax = test_sizes[nsizes-1];
|
||||||
|
|
||||||
|
/* allocate memory for all problems */
|
||||||
|
double* buf = NULL;
|
||||||
|
buf = (double*) malloc (3 * nmax * nmax * sizeof(double));
|
||||||
|
if (buf == NULL) die ("failed to allocate largest problem size");
|
||||||
|
|
||||||
|
double Mflops_s[nsizes],per[nsizes],aveper;
|
||||||
|
|
||||||
|
/* For each test size */
|
||||||
|
for (int isize = 0; isize < sizeof(test_sizes)/sizeof(test_sizes[0]); ++isize)
|
||||||
|
{
|
||||||
|
/* Create and fill 3 random matrices A,B,C*/
|
||||||
|
int n = test_sizes[isize];
|
||||||
|
|
||||||
|
double* A = buf + 0;
|
||||||
|
double* B = A + nmax*nmax;
|
||||||
|
double* C = B + nmax*nmax;
|
||||||
|
|
||||||
|
fill (A, n*n);
|
||||||
|
fill (B, n*n);
|
||||||
|
fill (C, n*n);
|
||||||
|
|
||||||
|
/* Measure performance (in Gflops/s). */
|
||||||
|
|
||||||
|
/* Time a "sufficiently long" sequence of calls to reduce noise */
|
||||||
|
double Gflops_s, seconds = -1.0;
|
||||||
|
double timeout = 0.1; // "sufficiently long" := at least 1/10 second.
|
||||||
|
for (int n_iterations = 1; seconds < timeout; n_iterations *= 2)
|
||||||
|
{
|
||||||
|
/* Warm-up */
|
||||||
|
square_dgemm (n, A, B, C);
|
||||||
|
|
||||||
|
/* Benchmark n_iterations runs of square_dgemm */
|
||||||
|
seconds = -wall_time();
|
||||||
|
for (int it = 0; it < n_iterations; ++it)
|
||||||
|
square_dgemm (n, A, B, C);
|
||||||
|
seconds += wall_time();
|
||||||
|
|
||||||
|
/* compute Gflop/s rate */
|
||||||
|
Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Storing Mflop rate and calculating percentage of peak */
|
||||||
|
Mflops_s[isize] = Gflops_s*1000;
|
||||||
|
per[isize] = Gflops_s*100/MAX_SPEED;
|
||||||
|
|
||||||
|
printf ("Size: %d\tMflop/s: %8g\tPercentage:%6.2lf\n", n, Mflops_s[isize],per[isize]);
|
||||||
|
|
||||||
|
/* Ensure that error does not exceed the theoretical error bound. */
|
||||||
|
|
||||||
|
/* C := A * B, computed with square_dgemm */
|
||||||
|
memset (C, 0, n * n * sizeof(double));
|
||||||
|
square_dgemm (n, A, B, C);
|
||||||
|
|
||||||
|
/* Do not explicitly check that A and B were unmodified on square_dgemm exit
|
||||||
|
* - if they were, the following will most likely detect it:
|
||||||
|
* C := C - A * B, computed with reference_dgemm */
|
||||||
|
reference_dgemm(n, -1., A, B, C);
|
||||||
|
|
||||||
|
/* A := |A|, B := |B|, C := |C| */
|
||||||
|
absolute_value (A, n * n);
|
||||||
|
absolute_value (B, n * n);
|
||||||
|
absolute_value (C, n * n);
|
||||||
|
|
||||||
|
/* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_dgemm */
|
||||||
|
reference_dgemm (n, -3.*DBL_EPSILON*n, A, B, C);
|
||||||
|
|
||||||
|
/* If any element in C is positive, then something went wrong in square_dgemm */
|
||||||
|
for (int i = 0; i < n * n; ++i)
|
||||||
|
if (C[i] > 0)
|
||||||
|
die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Calculating average percentage of peak reached by algorithm */
|
||||||
|
aveper=0;
|
||||||
|
for (int i=0; i<nsizes;i++)
|
||||||
|
aveper+= per[i];
|
||||||
|
aveper/=nsizes*1.0;
|
||||||
|
|
||||||
|
/* Printing average percentage to screen */
|
||||||
|
printf("#Average percentage of Peak = %g\n",aveper);
|
||||||
|
|
||||||
|
free (buf);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
38
Project1/project_1_maggioni_claudio/matmult/dgemm-blas.c
Normal file
38
Project1/project_1_maggioni_claudio/matmult/dgemm-blas.c
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
Please include compiler name below (you may also include any other modules you would like to be loaded)
|
||||||
|
|
||||||
|
COMPILER= gnu
|
||||||
|
|
||||||
|
Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
|
||||||
|
|
||||||
|
CC = cc
|
||||||
|
OPT = -O3
|
||||||
|
CFLAGS = -Wall -std=gnu99 $(OPT)
|
||||||
|
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
|
||||||
|
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define DGEMM dgemm_
|
||||||
|
extern void DGEMM (char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
|
||||||
|
|
||||||
|
const char* dgemm_desc = "Reference dgemm.";
|
||||||
|
|
||||||
|
/* This routine performs a dgemm operation
|
||||||
|
* C := C + A * B
|
||||||
|
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
||||||
|
* On exit, A and B maintain their input values.
|
||||||
|
* This function wraps a call to the BLAS-3 routine DGEMM, via the standard FORTRAN interface - hence the reference semantics. */
|
||||||
|
void square_dgemm (int N, double* A, double* B, double* C)
|
||||||
|
{
|
||||||
|
char TRANSA = 'N';
|
||||||
|
char TRANSB = 'N';
|
||||||
|
int M = N;
|
||||||
|
int K = N;
|
||||||
|
double ALPHA = 1.;
|
||||||
|
double BETA = 1.;
|
||||||
|
int LDA = N;
|
||||||
|
int LDB = N;
|
||||||
|
int LDC = N;
|
||||||
|
DGEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC);
|
||||||
|
}
|
37
Project1/project_1_maggioni_claudio/matmult/dgemm-blocked.c
Normal file
37
Project1/project_1_maggioni_claudio/matmult/dgemm-blocked.c
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
/*
|
||||||
|
Please include compiler name below (you may also include any other modules you would like to be loaded)
|
||||||
|
|
||||||
|
COMPILER= gnu
|
||||||
|
|
||||||
|
Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
|
||||||
|
|
||||||
|
CC = cc
|
||||||
|
OPT = -O3
|
||||||
|
CFLAGS = -Wall -std=gnu99 $(OPT)
|
||||||
|
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
|
||||||
|
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const char* dgemm_desc = "Naive, three-loop dgemm.";
|
||||||
|
|
||||||
|
/* This routine performs a dgemm operation
|
||||||
|
* C := C + A * B
|
||||||
|
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
||||||
|
* On exit, A and B maintain their input values. */
|
||||||
|
void square_dgemm (int n, double* A, double* B, double* C)
|
||||||
|
{
|
||||||
|
// TODO: Implement the blocking optimization
|
||||||
|
|
||||||
|
/* For each row i of A */
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
/* For each column j of B */
|
||||||
|
for (int j = 0; j < n; ++j)
|
||||||
|
{
|
||||||
|
/* Compute C(i,j) */
|
||||||
|
double cij = C[i+j*n];
|
||||||
|
for( int k = 0; k < n; k++ )
|
||||||
|
cij += A[i+k*n] * B[k+j*n];
|
||||||
|
C[i+j*n] = cij;
|
||||||
|
}
|
||||||
|
}
|
35
Project1/project_1_maggioni_claudio/matmult/dgemm-naive.c
Normal file
35
Project1/project_1_maggioni_claudio/matmult/dgemm-naive.c
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
/*
|
||||||
|
Please include compiler name below (you may also include any other modules you would like to be loaded)
|
||||||
|
|
||||||
|
COMPILER= gnu
|
||||||
|
|
||||||
|
Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
|
||||||
|
|
||||||
|
CC = cc
|
||||||
|
OPT = -O3
|
||||||
|
CFLAGS = -Wall -std=gnu99 $(OPT)
|
||||||
|
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
|
||||||
|
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const char* dgemm_desc = "Naive, three-loop dgemm.";
|
||||||
|
|
||||||
|
/* This routine performs a dgemm operation
|
||||||
|
* C := C + A * B
|
||||||
|
* where A, B, and C are lda-by-lda matrices stored in column-major format.
|
||||||
|
* On exit, A and B maintain their input values. */
|
||||||
|
void square_dgemm (int n, double* A, double* B, double* C)
|
||||||
|
{
|
||||||
|
/* For each row i of A */
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
/* For each column j of B */
|
||||||
|
for (int j = 0; j < n; ++j)
|
||||||
|
{
|
||||||
|
/* Compute C(i,j) */
|
||||||
|
double cij = C[i+j*n];
|
||||||
|
for( int k = 0; k < n; k++ )
|
||||||
|
cij += A[i+k*n] * B[k+j*n];
|
||||||
|
C[i+j*n] = cij;
|
||||||
|
}
|
||||||
|
}
|
28
Project1/project_1_maggioni_claudio/matmult/run_matrixmult.sh
Executable file
28
Project1/project_1_maggioni_claudio/matmult/run_matrixmult.sh
Executable file
|
@ -0,0 +1,28 @@
|
||||||
|
#!/bin/bash -l
|
||||||
|
|
||||||
|
#SBATCH --job-name=matrixmult
|
||||||
|
#SBATCH --time=00:30:00
|
||||||
|
#SBATCH --nodes=1
|
||||||
|
#SBATCH --output=matrixmult-%j.out
|
||||||
|
#SBATCH --error=matrixmult-%j.err
|
||||||
|
|
||||||
|
# load modules
|
||||||
|
if command -v module 1>/dev/null 2>&1; then
|
||||||
|
module load gcc/10.1.0 intel-mkl/2020.1.217-gcc-10.1.0-qsctnr6 gnuplot
|
||||||
|
fi
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=1
|
||||||
|
export MKL_NUM_THREADS=1
|
||||||
|
|
||||||
|
echo "==== benchmark-naive ======================"
|
||||||
|
./benchmark-naive | tee timing_basic_dgemm.data
|
||||||
|
echo
|
||||||
|
echo "==== benchmark-blas ======================="
|
||||||
|
./benchmark-blas | tee timing_blas_dgemm.data
|
||||||
|
echo
|
||||||
|
echo "==== benchmark-blocked ===================="
|
||||||
|
./benchmark-blocked | tee timing_blocked_dgemm.data
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "==== plot results ========================="
|
||||||
|
gnuplot timing.gp
|
20
Project1/project_1_maggioni_claudio/matmult/timing.gp
Normal file
20
Project1/project_1_maggioni_claudio/matmult/timing.gp
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
set title "NxN matrix-matrix-multiplication on 4-Core Intel(R) Xeon(R) CPU E3-1585L v5 @ 3.00GHz"
|
||||||
|
set xlabel "Matrix size (N)"
|
||||||
|
set ylabel "Performance (GFlop/s)"
|
||||||
|
set grid
|
||||||
|
set logscale y 10
|
||||||
|
|
||||||
|
set terminal postscript color "Helvetica" 14
|
||||||
|
set output "timing.ps"
|
||||||
|
|
||||||
|
# set terminal png color "Helvetica" 14
|
||||||
|
# set output "timing.png"
|
||||||
|
|
||||||
|
# plot "timing.data" using 2:4 title "square_dgemm" with linespoints
|
||||||
|
|
||||||
|
|
||||||
|
# For performance comparisons
|
||||||
|
|
||||||
|
plot "timing_basic_dgemm.data" using 2:4 title "Naive dgemm" with linespoints, \
|
||||||
|
"timing_blocked_dgemm.data" using 2:4 title "Blocked dgemm" with linespoints, \
|
||||||
|
"timing_blas_dgemm.data" using 2:4 title "MKL blas dgemm" with linespoints
|
Reference in a new issue