From 26a8e93698c43c0423107fa069c47ad3409c6bae Mon Sep 17 00:00:00 2001
From: Claudio Maggioni <maggicl@usi.ch>
Date: Tue, 27 Sep 2022 08:35:59 +0200
Subject: [PATCH] hw1: added sources

---
 .../matmult/Makefile                          |  33 ++++
 .../matmult/benchmark.c                       | 174 ++++++++++++++++++
 .../matmult/dgemm-blas.c                      |  38 ++++
 .../matmult/dgemm-blocked.c                   |  37 ++++
 .../matmult/dgemm-naive.c                     |  35 ++++
 .../matmult/run_matrixmult.sh                 |  28 +++
 .../matmult/timing.gp                         |  20 ++
 7 files changed, 365 insertions(+)
 create mode 100755 Project1/project_1_maggioni_claudio/matmult/Makefile
 create mode 100644 Project1/project_1_maggioni_claudio/matmult/benchmark.c
 create mode 100644 Project1/project_1_maggioni_claudio/matmult/dgemm-blas.c
 create mode 100644 Project1/project_1_maggioni_claudio/matmult/dgemm-blocked.c
 create mode 100644 Project1/project_1_maggioni_claudio/matmult/dgemm-naive.c
 create mode 100755 Project1/project_1_maggioni_claudio/matmult/run_matrixmult.sh
 create mode 100644 Project1/project_1_maggioni_claudio/matmult/timing.gp

diff --git a/Project1/project_1_maggioni_claudio/matmult/Makefile b/Project1/project_1_maggioni_claudio/matmult/Makefile
new file mode 100755
index 0000000..1f22bd7
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/Makefile
@@ -0,0 +1,33 @@
+# On Euler, we will benchmark your DGEMM's performance against the performance
+# of the default vendor-tuned DGEMM. This is done in benchmark-blas.
+#
+
+CC = gcc
+OPT = -O2
+CFLAGS = -Wall -std=gnu99 $(OPT)
+LDFLAGS = -Wall
+# librt is needed for clock_gettime
+LDLIBS = -lrt -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm -ldl -m64 -I${MKLROOT}/include
+
+targets = benchmark-naive benchmark-blocked benchmark-blas
+objects = benchmark.o dgemm-naive.o dgemm-blocked.o dgemm-blas.o  
+
+.PHONY : default
+default : all
+
+.PHONY : all
+all : clean $(targets)
+
+benchmark-naive : benchmark.o dgemm-naive.o 
+	$(CC) -o $@ $^ $(LDLIBS)
+benchmark-blocked : benchmark.o dgemm-blocked.o
+	$(CC) -o $@ $^ $(LDLIBS)
+benchmark-blas : benchmark.o dgemm-blas.o
+	$(CC) -o $@ $^ $(LDLIBS)
+
+%.o : %.c
+	$(CC) -c $(CFLAGS) $<
+
+.PHONY : clean
+clean:
+	rm -f $(targets) $(objects)
diff --git a/Project1/project_1_maggioni_claudio/matmult/benchmark.c b/Project1/project_1_maggioni_claudio/matmult/benchmark.c
new file mode 100644
index 0000000..7ee3b86
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/benchmark.c
@@ -0,0 +1,174 @@
+#include <stdlib.h> // For: exit, drand48, malloc, free, NULL, EXIT_FAILURE
+#include <stdio.h>  // For: perror
+#include <string.h> // For: memset
+
+#include <float.h>  // For: DBL_EPSILON
+#include <math.h>   // For: fabs
+
+#ifdef GETTIMEOFDAY
+#include <sys/time.h> // For struct timeval, gettimeofday
+#else
+#include <time.h> // For struct timespec, clock_gettime, CLOCK_MONOTONIC
+#endif
+
+// On icsmaster
+// 2.3 GHz * 8 vector width * 2 flops for FMA = 36.8 GF/s
+#define MAX_SPEED 36.8
+
+/* reference_dgemm wraps a call to the BLAS-3 routine DGEMM, via the standard FORTRAN interface - hence the reference semantics. */ 
+#define DGEMM dgemm_
+extern void DGEMM (char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); 
+void reference_dgemm (int N, double ALPHA, double* A, double* B, double* C)
+{
+  char TRANSA = 'N';
+  char TRANSB = 'N';
+  int M = N;
+  int K = N;
+  double BETA = 1.;
+  int LDA = N;
+  int LDB = N;
+  int LDC = N;
+  DGEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC);
+}   
+
+/* Your function must have the following signature: */
+extern const char* dgemm_desc;
+extern void square_dgemm (int, double*, double*, double*);
+
+double wall_time ()
+{
+#ifdef GETTIMEOFDAY
+  struct timeval t;
+  gettimeofday (&t, NULL);
+  return 1.*t.tv_sec + 1.e-6*t.tv_usec;
+#else
+  struct timespec t;
+  clock_gettime (CLOCK_MONOTONIC, &t);
+  return 1.*t.tv_sec + 1.e-9*t.tv_nsec;
+#endif
+}
+
+void die (const char* message)
+{
+  perror (message);
+  exit (EXIT_FAILURE);
+}
+
+void fill (double* p, int n)
+{
+  for (int i = 0; i < n; ++i)
+    p[i] = 2 * drand48() - 1; // Uniformly distributed over [-1, 1]
+}
+
+void absolute_value (double *p, int n)
+{
+  for (int i = 0; i < n; ++i)
+    p[i] = fabs (p[i]);
+}
+
+/* The benchmarking program */
+int main (int argc, char **argv)
+{
+  printf ("#Description:\t%s\n\n", dgemm_desc);
+
+  /* Test sizes should highlight performance dips at multiples of certain powers-of-two */
+
+  int test_sizes[] = 
+
+  /* Multiples-of-32, +/- 1. Currently commented. */
+  /* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
+
+  /* A representative subset of the first list. Currently uncommented. */ 
+  { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
+    319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
+
+  int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);
+
+  /* assume last size is also the largest size */
+  int nmax = test_sizes[nsizes-1];
+
+  /* allocate memory for all problems */
+  double* buf = NULL;
+  buf = (double*) malloc (3 * nmax * nmax * sizeof(double));
+  if (buf == NULL) die ("failed to allocate largest problem size");
+
+  double Mflops_s[nsizes],per[nsizes],aveper;
+
+  /* For each test size */
+  for (int isize = 0; isize < sizeof(test_sizes)/sizeof(test_sizes[0]); ++isize)
+  {
+    /* Create and fill 3 random matrices A,B,C*/
+    int n = test_sizes[isize];
+
+    double* A = buf + 0;
+    double* B = A + nmax*nmax;
+    double* C = B + nmax*nmax;
+
+    fill (A, n*n);
+    fill (B, n*n);
+    fill (C, n*n);
+
+    /* Measure performance (in Gflops/s). */
+
+    /* Time a "sufficiently long" sequence of calls to reduce noise */
+    double Gflops_s, seconds = -1.0;
+    double timeout = 0.1; // "sufficiently long" := at least 1/10 second.
+    for (int n_iterations = 1; seconds < timeout; n_iterations *= 2) 
+    {
+      /* Warm-up */
+      square_dgemm (n, A, B, C);
+
+      /* Benchmark n_iterations runs of square_dgemm */
+      seconds = -wall_time();
+      for (int it = 0; it < n_iterations; ++it)
+	square_dgemm (n, A, B, C);
+      seconds += wall_time();
+
+      /*  compute Gflop/s rate */
+      Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds;
+    }
+  
+    /* Storing Mflop rate and calculating percentage of peak */
+    Mflops_s[isize] = Gflops_s*1000;
+    per[isize] = Gflops_s*100/MAX_SPEED;
+
+    printf ("Size: %d\tMflop/s: %8g\tPercentage:%6.2lf\n", n, Mflops_s[isize],per[isize]);
+
+    /* Ensure that error does not exceed the theoretical error bound. */
+
+    /* C := A * B, computed with square_dgemm */
+    memset (C, 0, n * n * sizeof(double));
+    square_dgemm (n, A, B, C);
+
+    /* Do not explicitly check that A and B were unmodified on square_dgemm exit
+     *  - if they were, the following will most likely detect it:   
+     * C := C - A * B, computed with reference_dgemm */
+    reference_dgemm(n, -1., A, B, C);
+
+    /* A := |A|, B := |B|, C := |C| */
+    absolute_value (A, n * n);
+    absolute_value (B, n * n);
+    absolute_value (C, n * n);
+
+    /* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_dgemm */ 
+    reference_dgemm (n, -3.*DBL_EPSILON*n, A, B, C);
+
+    /* If any element in C is positive, then something went wrong in square_dgemm */
+    for (int i = 0; i < n * n; ++i)
+      if (C[i] > 0)
+	die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" );
+  }
+
+  /* Calculating average percentage of peak reached by algorithm */
+  aveper=0;
+  for (int i=0; i<nsizes;i++)
+    aveper+= per[i];
+  aveper/=nsizes*1.0;
+  
+  /* Printing average percentage to screen */
+  printf("#Average percentage of Peak = %g\n",aveper);
+
+  free (buf);
+
+  return 0;
+}
diff --git a/Project1/project_1_maggioni_claudio/matmult/dgemm-blas.c b/Project1/project_1_maggioni_claudio/matmult/dgemm-blas.c
new file mode 100644
index 0000000..6894bab
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/dgemm-blas.c
@@ -0,0 +1,38 @@
+/* 
+    Please include compiler name below (you may also include any other modules you would like to be loaded)
+
+COMPILER= gnu
+
+    Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
+ 
+CC = cc
+OPT = -O3
+CFLAGS = -Wall -std=gnu99 $(OPT)
+MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
+LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
+
+*/
+
+#define DGEMM dgemm_
+extern void DGEMM (char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); 
+
+const char* dgemm_desc = "Reference dgemm.";
+
+/* This routine performs a dgemm operation
+ *  C := C + A * B
+ * where A, B, and C are lda-by-lda matrices stored in column-major format.
+ * On exit, A and B maintain their input values.    
+ * This function wraps a call to the BLAS-3 routine DGEMM, via the standard FORTRAN interface - hence the reference semantics. */
+void square_dgemm (int N, double* A, double* B, double* C)
+{
+  char TRANSA = 'N';
+  char TRANSB = 'N';
+  int M = N;
+  int K = N;
+  double ALPHA = 1.;
+  double BETA = 1.;
+  int LDA = N;
+  int LDB = N;
+  int LDC = N;
+  DGEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC);
+}   
diff --git a/Project1/project_1_maggioni_claudio/matmult/dgemm-blocked.c b/Project1/project_1_maggioni_claudio/matmult/dgemm-blocked.c
new file mode 100644
index 0000000..17adcbf
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/dgemm-blocked.c
@@ -0,0 +1,37 @@
+/* 
+    Please include compiler name below (you may also include any other modules you would like to be loaded)
+
+COMPILER= gnu
+
+    Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
+ 
+CC = cc
+OPT = -O3
+CFLAGS = -Wall -std=gnu99 $(OPT)
+MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
+LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
+
+*/
+
+const char* dgemm_desc = "Naive, three-loop dgemm.";
+
+/* This routine performs a dgemm operation
+ *  C := C + A * B
+ * where A, B, and C are lda-by-lda matrices stored in column-major format.
+ * On exit, A and B maintain their input values. */    
+void square_dgemm (int n, double* A, double* B, double* C)
+{
+  // TODO: Implement the blocking optimization
+
+  /* For each row i of A */
+  for (int i = 0; i < n; ++i)
+    /* For each column j of B */
+    for (int j = 0; j < n; ++j) 
+    {
+      /* Compute C(i,j) */
+      double cij = C[i+j*n];
+      for( int k = 0; k < n; k++ )
+	cij += A[i+k*n] * B[k+j*n];
+      C[i+j*n] = cij;
+    }
+}
diff --git a/Project1/project_1_maggioni_claudio/matmult/dgemm-naive.c b/Project1/project_1_maggioni_claudio/matmult/dgemm-naive.c
new file mode 100644
index 0000000..b0a6a4e
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/dgemm-naive.c
@@ -0,0 +1,35 @@
+/* 
+    Please include compiler name below (you may also include any other modules you would like to be loaded)
+
+COMPILER= gnu
+
+    Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
+ 
+CC = cc
+OPT = -O3
+CFLAGS = -Wall -std=gnu99 $(OPT)
+MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
+LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
+
+*/
+
+const char* dgemm_desc = "Naive, three-loop dgemm.";
+
+/* This routine performs a dgemm operation
+ *  C := C + A * B
+ * where A, B, and C are lda-by-lda matrices stored in column-major format.
+ * On exit, A and B maintain their input values. */    
+void square_dgemm (int n, double* A, double* B, double* C)
+{
+  /* For each row i of A */
+  for (int i = 0; i < n; ++i)
+    /* For each column j of B */
+    for (int j = 0; j < n; ++j) 
+    {
+      /* Compute C(i,j) */
+      double cij = C[i+j*n];
+      for( int k = 0; k < n; k++ )
+	cij += A[i+k*n] * B[k+j*n];
+      C[i+j*n] = cij;
+    }
+}
diff --git a/Project1/project_1_maggioni_claudio/matmult/run_matrixmult.sh b/Project1/project_1_maggioni_claudio/matmult/run_matrixmult.sh
new file mode 100755
index 0000000..c937f8b
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/run_matrixmult.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -l
+
+#SBATCH --job-name=matrixmult
+#SBATCH --time=00:30:00
+#SBATCH --nodes=1
+#SBATCH --output=matrixmult-%j.out
+#SBATCH --error=matrixmult-%j.err
+
+# load modules
+if command -v module 1>/dev/null 2>&1; then
+   module load gcc/10.1.0 intel-mkl/2020.1.217-gcc-10.1.0-qsctnr6 gnuplot
+fi
+
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+
+echo "==== benchmark-naive ======================"
+./benchmark-naive | tee timing_basic_dgemm.data
+echo
+echo "==== benchmark-blas ======================="
+./benchmark-blas | tee timing_blas_dgemm.data
+echo
+echo "==== benchmark-blocked ===================="
+./benchmark-blocked | tee timing_blocked_dgemm.data
+
+echo
+echo "==== plot results ========================="
+gnuplot timing.gp
diff --git a/Project1/project_1_maggioni_claudio/matmult/timing.gp b/Project1/project_1_maggioni_claudio/matmult/timing.gp
new file mode 100644
index 0000000..8a6ee16
--- /dev/null
+++ b/Project1/project_1_maggioni_claudio/matmult/timing.gp
@@ -0,0 +1,20 @@
+set title "NxN matrix-matrix-multiplication on 4-Core Intel(R) Xeon(R) CPU E3-1585L v5 @ 3.00GHz"
+set xlabel "Matrix size (N)"
+set ylabel "Performance (GFlop/s)"
+set grid
+set logscale y 10
+
+set terminal postscript color "Helvetica" 14
+set output "timing.ps"
+
+# set terminal png color "Helvetica" 14
+# set output "timing.png"
+
+# plot "timing.data" using 2:4 title "square_dgemm" with linespoints
+
+
+# For performance comparisons
+
+plot "timing_basic_dgemm.data"   using 2:4 title "Naive dgemm" with linespoints, \
+     "timing_blocked_dgemm.data" using 2:4 title "Blocked dgemm" with linespoints, \
+     "timing_blas_dgemm.data"   using 2:4 title "MKL blas dgemm" with linespoints