Merge branch 'master' of git.maggioni.xyz:maggicl/HPC

This commit is contained in:
Claudio Maggioni 2022-09-27 09:00:20 +02:00
commit 49e302e8bc
22 changed files with 3120 additions and 0 deletions

302
.gitignore vendored Normal file
View file

@ -0,0 +1,302 @@
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
# fixme
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
# easy-todo
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
# (gitignore patterns are matched against paths relative to this file and a
# leading "./" is not understood, so the directory is named without it)
auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

BIN
Project0/project0.pdf Normal file

Binary file not shown.

13
Project1/Makefile Normal file
View file

@ -0,0 +1,13 @@
# Base name (without extension) of the LaTeX report.
filename=project_1_maggioni_claudio

# None of these targets creates a file named after itself.
.PHONY: pdf read clean

# Build the PDF. pdflatex runs twice so that references written to the
# auxiliary files by the first pass are picked up by the second, then the
# auxiliary files are removed through the clean target.
pdf:
	pdflatex ${filename}
	pdflatex ${filename}
	$(MAKE) clean

# Open the generated PDF in evince without blocking the terminal.
read:
	evince ${filename}.pdf &

# Remove LaTeX/BibTeX/beamer by-products of the report (each extension is
# listed once; the PDF itself is kept).
clean:
	rm -f ${filename}.out ${filename}.log ${filename}.bbl ${filename}.blg ${filename}.ps ${filename}.aux ${filename}.dvi ${filename}.toc ${filename}.nav ${filename}.vrb ${filename}.snm

95
Project1/assignment.sty Normal file
View file

@ -0,0 +1,95 @@
\usepackage{ifthen}
\usepackage[utf8]{inputenc}
\usepackage{graphics}
\usepackage{graphicx}
\usepackage{hyperref}
\pagestyle{plain}
\voffset -5mm
\oddsidemargin 0mm
\evensidemargin -11mm
\marginparwidth 2cm
\marginparsep 0pt
\topmargin 0mm
\headheight 0pt
\headsep 0pt
\topskip 0pt
\textheight 255mm
\textwidth 165mm
% \duedate holds the due-date text; it is empty until \setduedate is called.
\newcommand{\duedate} {}
% \setduedate{<date>} redefines \duedate to "Due date:~ <date>".
\newcommand{\setduedate}[1]{%
\renewcommand\duedate {Due date:~ #1}}
% \isassignment expands to the name of an ifthen boolean: "false" by default,
% "true" after \setassignment has been called.
\newcommand\isassignment {false}
\newcommand{\setassignment}{\renewcommand\isassignment {true}}
% \ifassignment{<text>} typesets <text> only when \setassignment was called;
% \ifnotassignment{<text>} typesets <text> only when it was not.
\newcommand{\ifassignment}[1]{\ifthenelse{\boolean{\isassignment}}{#1}{}}
\newcommand{\ifnotassignment}[1]{\ifthenelse{\boolean{\isassignment}}{}{#1}}
\newcommand{\assignmentpolicy}{
\begin{table}[h]
\begin{center}
\scalebox{0.8} {%
\begin{tabular}{|p{0.02cm}p{16cm}|}
\hline
&\\
\multicolumn{2}{|c|}{\Large\textbf{HPC 2022 --- Submission Instructions}}\\
\multicolumn{2}{|c|}{\large\textbf{(Please, notice that following instructions are mandatory: }}\\
\multicolumn{2}{|c|}{\large\textbf{submissions that don't comply with, won't be considered)}}\\
&\\
\textbullet & Assignments must be submitted to \href{https://www.icorsi.ch/course/view.php?id=14652}{iCorsi} (i.e. in electronic format).\\
\textbullet & Provide both executable package and sources (e.g. C/C++ files, Matlab).
If you are using libraries, please add them in the file. Sources must be organized in directories called:\\
\multicolumn{2}{|c|}{\textit{Project\_number\_lastname\_firstname}}\\
& and the file must be called:\\
\multicolumn{2}{|c|}{\textit{project\_number\_lastname\_firstname.zip}}\\
\multicolumn{2}{|c|}{\textit{project\_number\_lastname\_firstname.pdf}}\\
\textbullet & The TAs will grade your project by reviewing your project write-up, and looking at the implementation
you attempted, and benchmarking your code's performance.\\
\textbullet & You are allowed to discuss all questions with anyone you like; however: (i) your submission must list anyone you discussed problems with and (ii) you must write up your submission independently.\\
\hline
\end{tabular}
}
\end{center}
\end{table}
}
% \punkte{<n>} appends "(<n> Points)" pushed to the right with \hfill, in
% emphasised medium-weight type. <n> must be a number because it is fed to
% \ifcase; all three \ifcase branches currently produce the same word "Points".
\newcommand{\punkte}[1]{\hspace{1ex}\emph{\mdseries\hfill(#1~\ifcase#1{Points}\or{Points}\else{Points}\fi)}}
\newcommand\serieheader[6]{
\thispagestyle{empty}%
\begin{flushleft}
\includegraphics[width=0.4\textwidth]{usi_inf.png}
\end{flushleft}
\noindent%
{\large\ignorespaces{\textbf{#1}}\hspace{\fill}\ignorespaces{ \textbf{#2}}}\\ \\%
{\large\ignorespaces #3 \hspace{\fill}\ignorespaces #4}\\
\noindent%
\bigskip
\hrule\par\bigskip\noindent%
\bigskip {\ignorespaces {\Large{\textbf{#5}}}
\hspace{\fill}\ignorespaces \large \ifthenelse{\boolean{\isassignment}}{\duedate}{#6}}
\hrule\par\bigskip\noindent% \linebreak
}
\makeatletter
\def\enumerateMod{\ifnum \@enumdepth >3 \@toodeep\else
\advance\@enumdepth \@ne
\edef\@enumctr{enum\romannumeral\the\@enumdepth}\list
{\csname label\@enumctr\endcsname}{\usecounter
{\@enumctr}%%%? the following differs from "enumerate"
\topsep0pt%
\partopsep0pt%
\itemsep0pt%
\def\makelabel##1{\hss\llap{##1}}}\fi}
\let\endenumerateMod =\endlist
\makeatother
\usepackage{textcomp}

BIN
Project1/project1.pdf Normal file

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,79 @@
\documentclass[unicode,11pt,a4paper,oneside,numbers=endperiod,openany]{scrartcl}
\input{assignment.sty}
\usepackage{fancyvrb}
\begin{document}
\setassignment
\setduedate{12.10.2022 (midnight)}
\serieheader{High-Performance Computing Lab}{2022}{Student: Claudio
Maggioni}{Discussed with: ---}{Solution for Project 1}{}
\newline
\assignmentpolicy
In this project you will practice memory access optimization, performance-oriented programming, and OpenMP parallelization
on the ICS Cluster.
\section{Explaining Memory Hierarchies \punkte{25}}
By identifying the memory hierarchy parameters through \texttt{likwid-topology}
for the cache topology and \texttt{free -g} for the amount of primary memory I
find the following values:
\begin{center}
\begin{tabular}{llll}
Main memory & 62 GB \\
L3 cache & 25 MB per socket \\
L2 cache & 256 kB per core \\
L1 cache & 32 kB per core
\end{tabular}
\end{center}
All values are reported using base 2 IEC byte units. The cluster has 2 sockets
and a total of 20 cores (10 per socket). The cache topology diagram reported by
\texttt{likwid-topology -g} is the following:
\pagebreak[4]
% https://tex.stackexchange.com/a/171818
\begin{Verbatim}[fontsize=\tiny]
Socket 0:
+---------------------------------------------------------------------------------------------------------------+
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| | 0 | | 1 | | 2 | | 3 | | 4 | | 5 | | 6 | | 7 | | 8 | | 9 | |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| +-----------------------------------------------------------------------------------------------------------+ |
| | 25 MB | |
| +-----------------------------------------------------------------------------------------------------------+ |
+---------------------------------------------------------------------------------------------------------------+
Socket 1:
+---------------------------------------------------------------------------------------------------------------+
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| | 10 | | 11 | | 12 | | 13 | | 14 | | 15 | | 16 | | 17 | | 18 | | 19 | |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | |
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
| +-----------------------------------------------------------------------------------------------------------+ |
| | 25 MB | |
| +-----------------------------------------------------------------------------------------------------------+ |
+---------------------------------------------------------------------------------------------------------------+
\end{Verbatim}
\section{Optimize Square Matrix-Matrix Multiplication \punkte{60}}
\section{Quality of the Report \punkte{15}}
\end{document}

View file

@ -0,0 +1,33 @@
# On Euler, we will benchmark your DGEMM's performance against the performance
# of the default vendor-tuned DGEMM. This is done in benchmark-blas.
#
CC = gcc
OPT = -O2
CFLAGS = -Wall -std=gnu99 $(OPT)
LDFLAGS = -Wall
# librt is needed for clock_gettime
LDLIBS = -lrt -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm -ldl -m64 -I${MKLROOT}/include

targets = benchmark-naive benchmark-blocked benchmark-blas
objects = benchmark.o dgemm-naive.o dgemm-blocked.o dgemm-blas.o

.PHONY : default all clean

default : all

# Build every benchmark executable. `clean` is deliberately NOT a prerequisite
# here: as a prerequisite it may run concurrently with the compile steps under
# `make -j` and delete objects that were just built. Run `make clean all` to
# force a rebuild from scratch.
all : $(targets)

# benchmark-<variant> links the common driver with dgemm-<variant>.o.
$(targets) : benchmark-% : benchmark.o dgemm-%.o
	$(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS)

%.o : %.c
	$(CC) -c $(CFLAGS) -o $@ $<

# Remove the executables and object files produced by this Makefile.
clean:
	rm -f $(targets) $(objects)

View file

@ -0,0 +1,174 @@
#include <stdlib.h> // For: exit, drand48, malloc, free, NULL, EXIT_FAILURE
#include <stdio.h> // For: perror
#include <string.h> // For: memset
#include <float.h> // For: DBL_EPSILON
#include <math.h> // For: fabs
#ifdef GETTIMEOFDAY
#include <sys/time.h> // For struct timeval, gettimeofday
#else
#include <time.h> // For struct timespec, clock_gettime, CLOCK_MONOTONIC
#endif
// On icsmaster
// 2.3 GHz * 8 vector width * 2 flops for FMA = 36.8 GF/s
#define MAX_SPEED 36.8
/* reference_dgemm wraps a call to the BLAS-3 routine DGEMM, via the standard FORTRAN interface - hence the reference semantics. */
#define DGEMM dgemm_
extern void DGEMM (char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
/* Reference update C := C + ALPHA * A * B on N-by-N matrices, delegated to the
 * BLAS-3 routine DGEMM through its Fortran interface (hence every argument is
 * handed over by address). Neither operand is transposed, the scaling of C is
 * fixed to 1 and all three leading dimensions are N. */
void reference_dgemm (int N, double ALPHA, double* A, double* B, double* C)
{
  char op_a = 'N', op_b = 'N';     /* 'N': use A and B as given */
  int rows = N, inner = N;         /* square problem: M = K = N */
  int ld_a = N, ld_b = N, ld_c = N;
  double keep_c = 1.;              /* BETA = 1 accumulates into C */

  DGEMM(&op_a, &op_b, &rows, &N, &inner, &ALPHA, A, &ld_a, B, &ld_b, &keep_c, C, &ld_c);
}
/* Your function must have the following signature: */
extern const char* dgemm_desc;
extern void square_dgemm (int, double*, double*, double*);
/* Current clock reading in seconds, as a double.
 * With GETTIMEOFDAY defined the value comes from gettimeofday() (microsecond
 * field), otherwise from clock_gettime(CLOCK_MONOTONIC) (nanosecond field).
 * Only differences between two readings are used by the benchmark. */
double wall_time ()
{
#ifdef GETTIMEOFDAY
  struct timeval now;
  gettimeofday (&now, NULL);
  double whole = 1.*now.tv_sec;
  double fraction = 1.e-6*now.tv_usec;
  return whole + fraction;
#else
  struct timespec now;
  clock_gettime (CLOCK_MONOTONIC, &now);
  double whole = 1.*now.tv_sec;
  double fraction = 1.e-9*now.tv_nsec;
  return whole + fraction;
#endif
}
/* Report a fatal error and terminate the program: perror() writes `message`
 * followed by the description of the current errno value to stderr, then the
 * process exits with status EXIT_FAILURE. This function does not return.
 * NOTE(review): the errno text is only meaningful when the failure that led
 * here actually set errno - confirm for each caller. */
void die (const char* message)
{
perror (message);
exit (EXIT_FAILURE);
}
/* Overwrite the first n entries of p with pseudo-random values 2*drand48()-1,
 * i.e. numbers spread over the interval from -1 to 1. Nothing is written when
 * n is not positive. */
void fill (double* p, int n)
{
  int idx = 0;
  while (idx < n)
  {
    p[idx] = 2 * drand48() - 1;
    ++idx;
  }
}
/* Replace each of the first n entries of p by its absolute value (fabs),
 * in place. Nothing is touched when n is not positive. */
void absolute_value (double *p, int n)
{
  int idx = 0;
  while (idx < n)
  {
    double magnitude = fabs (p[idx]);
    p[idx] = magnitude;
    ++idx;
  }
}
/* The benchmarking program */
/* The benchmarking program.
 *
 * For every entry of test_sizes it fills three n-by-n matrices with random
 * values, times square_dgemm over a doubling number of iterations until one
 * timed batch lasts at least `timeout` seconds, prints the achieved Mflop/s
 * and the percentage of MAX_SPEED, and finally checks the result of
 * square_dgemm against reference_dgemm with a componentwise error bound.
 * The average percentage over all sizes is printed at the end.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the work buffer cannot be
 * allocated or if the correctness check fails. argc/argv are not used. */
int main (int argc, char **argv)
{
  printf ("#Description:\t%s\n\n", dgemm_desc);

  /* Test sizes should highlight performance dips at multiples of certain powers-of-two */
  int test_sizes[] =
  /* Multiples-of-32, +/- 1. Currently commented. */
  /* {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; */
  /* A representative subset of the first list. Currently uncommented. */
  { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
    319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };

  int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);

  /* largest problem size, found by scanning so the list need not be sorted */
  int nmax = test_sizes[0];
  for (int i = 1; i < nsizes; ++i)
    if (test_sizes[i] > nmax)
      nmax = test_sizes[i];

  /* allocate memory for all problems: three nmax-by-nmax matrices, with the
   * byte count computed in size_t */
  double* buf = (double*) malloc (3 * (size_t) nmax * nmax * sizeof(double));
  if (buf == NULL) die ("failed to allocate largest problem size");

  double Mflops_s[nsizes],per[nsizes],aveper;

  /* For each test size */
  for (int isize = 0; isize < nsizes; ++isize)
  {
    /* Create and fill 3 random matrices A,B,C*/
    int n = test_sizes[isize];

    double* A = buf + 0;
    double* B = A + nmax*nmax;
    double* C = B + nmax*nmax;

    fill (A, n*n);
    fill (B, n*n);
    fill (C, n*n);

    /* Measure performance (in Gflops/s). */

    /* Time a "sufficiently long" sequence of calls to reduce noise */
    double Gflops_s = 0.0, seconds = -1.0;
    double timeout = 0.1; // "sufficiently long" := at least 1/10 second.
    for (int n_iterations = 1; seconds < timeout; n_iterations *= 2)
    {
      /* Warm-up */
      square_dgemm (n, A, B, C);

      /* Benchmark n_iterations runs of square_dgemm */
      seconds = -wall_time();
      for (int it = 0; it < n_iterations; ++it)
        square_dgemm (n, A, B, C);
      seconds += wall_time();

      /* compute Gflop/s rate: 2*n^3 floating point operations per call
       * (evaluated in double because the leading constant is a double) */
      Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds;
    }

    /* Storing Mflop rate and calculating percentage of peak */
    Mflops_s[isize] = Gflops_s*1000;
    per[isize] = Gflops_s*100/MAX_SPEED;

    printf ("Size: %d\tMflop/s: %8g\tPercentage:%6.2lf\n", n, Mflops_s[isize],per[isize]);

    /* Ensure that error does not exceed the theoretical error bound. */

    /* C := A * B, computed with square_dgemm */
    memset (C, 0, n * n * sizeof(double));
    square_dgemm (n, A, B, C);

    /* Do not explicitly check that A and B were unmodified on square_dgemm exit
     *  - if they were, the following will most likely detect it:
     * C := C - A * B, computed with reference_dgemm */
    reference_dgemm(n, -1., A, B, C);

    /* A := |A|, B := |B|, C := |C| */
    absolute_value (A, n * n);
    absolute_value (B, n * n);
    absolute_value (C, n * n);

    /* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_dgemm */
    reference_dgemm (n, -3.*DBL_EPSILON*n, A, B, C);

    /* If any element in C is positive, then something went wrong in square_dgemm.
     * This is not an errno-style failure, so the message is written directly
     * to stderr instead of going through perror(), which would append an
     * unrelated errno description. */
    for (int i = 0; i < n * n; ++i)
      if (C[i] > 0)
      {
        fputs ("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n", stderr);
        exit (EXIT_FAILURE);
      }
  }

  /* Calculating average percentage of peak reached by algorithm */
  aveper=0;
  for (int i=0; i<nsizes;i++)
    aveper+= per[i];
  aveper/=nsizes*1.0;

  /* Printing average percentage to screen */
  printf("#Average percentage of Peak = %g\n",aveper);

  free (buf);

  return 0;
}

View file

@ -0,0 +1,38 @@
/*
Please include compiler name below (you may also include any other modules you would like to be loaded)
COMPILER= gnu
Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
CC = cc
OPT = -O3
CFLAGS = -Wall -std=gnu99 $(OPT)
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
*/
#define DGEMM dgemm_
extern void DGEMM (char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
const char* dgemm_desc = "Reference dgemm.";
/* This routine performs a dgemm operation
* C := C + A * B
* where A, B, and C are lda-by-lda matrices stored in column-major format.
* On exit, A and B maintain their input values.
* This function wraps a call to the BLAS-3 routine DGEMM, via the standard FORTRAN interface - hence the reference semantics. */
/* C := C + A * B for N-by-N matrices, computed by the BLAS-3 routine DGEMM.
 * DGEMM is reached through its Fortran interface, so scalars are placed in
 * locals and passed by address. Both scaling factors are 1, no operand is
 * transposed and every leading dimension equals N. */
void square_dgemm (int N, double* A, double* B, double* C)
{
  char op_a = 'N', op_b = 'N';
  int rows = N, inner = N;
  int ld_a = N, ld_b = N, ld_c = N;
  double one_alpha = 1.;
  double one_beta = 1.;

  DGEMM(&op_a, &op_b, &rows, &N, &inner, &one_alpha, A, &ld_a, B, &ld_b, &one_beta, C, &ld_c);
}

View file

@ -0,0 +1,37 @@
/*
Please include compiler name below (you may also include any other modules you would like to be loaded)
COMPILER= gnu
Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
CC = cc
OPT = -O3
CFLAGS = -Wall -std=gnu99 $(OPT)
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
*/
const char* dgemm_desc = "Naive, three-loop dgemm.";
/* This routine performs a dgemm operation
* C := C + A * B
* where A, B, and C are lda-by-lda matrices stored in column-major format.
* On exit, A and B maintain their input values. */
/* C := C + A * B on n-by-n matrices stored column by column, i.e. element
 * (row, col) lives at index row + col*n. A and B are only read. */
void square_dgemm (int n, double* A, double* B, double* C)
{
  // TODO: Implement the blocking optimization
  for (int row = 0; row < n; ++row)
  {
    for (int col = 0; col < n; ++col)
    {
      /* start of column `col` in B and the destination entry C(row,col) */
      const double* b_col = B + col*n;
      double* c_entry = C + row + col*n;

      /* accumulate the dot product of row `row` of A with column `col` of B */
      double acc = *c_entry;
      for (int k = 0; k < n; ++k)
        acc += A[row + k*n] * b_col[k];
      *c_entry = acc;
    }
  }
}

View file

@ -0,0 +1,35 @@
/*
Please include compiler name below (you may also include any other modules you would like to be loaded)
COMPILER= gnu
Please include All compiler flags and libraries as you want them run. You can simply copy this over from the Makefile's first few lines
CC = cc
OPT = -O3
CFLAGS = -Wall -std=gnu99 $(OPT)
MKLROOT = /opt/intel/composer_xe_2013.1.117/mkl
LDLIBS = -lrt -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm
*/
const char* dgemm_desc = "Naive, three-loop dgemm.";
/* This routine performs a dgemm operation
* C := C + A * B
* where A, B, and C are lda-by-lda matrices stored in column-major format.
* On exit, A and B maintain their input values. */
/* Straightforward triple loop for C := C + A * B. All three matrices are
 * n-by-n and indexed as element(i,j) = M[i + j*n]; A and B are only read. */
void square_dgemm (int n, double* A, double* B, double* C)
{
  int i, j, k;

  for (i = 0; i < n; ++i)        /* row of A and of C */
  {
    for (j = 0; j < n; ++j)      /* column of B and of C */
    {
      const int c_idx = i + j*n;
      double sum = C[c_idx];     /* start from the existing C(i,j) */

      for (k = 0; k < n; ++k)
      {
        const double a_ik = A[i + k*n];
        const double b_kj = B[k + j*n];
        sum += a_ik * b_kj;
      }

      C[c_idx] = sum;
    }
  }
}

View file

@ -0,0 +1,28 @@
#!/bin/bash -l
#SBATCH --job-name=matrixmult
#SBATCH --time=00:30:00
#SBATCH --nodes=1
#SBATCH --output=matrixmult-%j.out
#SBATCH --error=matrixmult-%j.err
# Batch script: runs the three DGEMM benchmark executables one after another
# and plots their results. The executables are expected in the current
# directory (they are not built here).

# load modules
# (only when a `module` command is available in this shell)
if command -v module 1>/dev/null 2>&1; then
module load gcc/10.1.0 intel-mkl/2020.1.217-gcc-10.1.0-qsctnr6 gnuplot
fi
# Ask the OpenMP and MKL runtimes for a single thread each.
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
# Each benchmark prints to the job output and, through tee, to the .data file
# that timing.gp reads for the corresponding curve.
echo "==== benchmark-naive ======================"
./benchmark-naive | tee timing_basic_dgemm.data
echo
echo "==== benchmark-blas ======================="
./benchmark-blas | tee timing_blas_dgemm.data
echo
echo "==== benchmark-blocked ===================="
./benchmark-blocked | tee timing_blocked_dgemm.data
echo
echo "==== plot results ========================="
gnuplot timing.gp

View file

@ -0,0 +1,20 @@
# Plots the performance of the three DGEMM variants against the matrix size.
# Input lines have the form "Size: <n> Mflop/s: <rate> Percentage:<p>", so
# column 2 is the matrix size and column 4 is the rate in Mflop/s.
# NOTE(review): the CPU named in the title may not be the machine the data was
# collected on - confirm and adjust.
set title "NxN matrix-matrix-multiplication on 4-Core Intel(R) Xeon(R) CPU E3-1585L v5 @ 3.00GHz"
set xlabel "Matrix size (N)"
# The plotted column is in Mflop/s, so the axis is labelled accordingly.
set ylabel "Performance (MFlop/s)"
set grid
set logscale y 10

set terminal postscript color "Helvetica" 14
set output "timing.ps"
# set terminal png color "Helvetica" 14
# set output "timing.png"

# plot "timing.data" using 2:4 title "square_dgemm" with linespoints

# For performance comparisons
plot "timing_basic_dgemm.data" using 2:4 title "Naive dgemm" with linespoints, \
"timing_blocked_dgemm.data" using 2:4 title "Blocked dgemm" with linespoints, \
"timing_blas_dgemm.data" using 2:4 title "MKL blas dgemm" with linespoints

View file

@ -0,0 +1,30 @@
#
# Usage:
# make # run benchmark on the local machine or on cluster compute node
# # operated by SLURM
#
CC = gcc

.PRECIOUS: %.gp %.xxx %.out
.PHONY: generic clean tar

#generic: generic.ps
generic: membench

membench: membench.c
	$(CC) -O3 -o $@ $<

# Remove everything generated for the default "generic" data set.
clean:
	$(MAKE) generic.clean

# <name>.ps: submit the benchmark job for <name>, then plot <name>.gp.
# The module system is loaded in the same shell invocation as gnuplot (each
# recipe line runs in its own shell) and only when `module` exists.
# NOTE(review): sbatch returns once the job is queued, so the plot step may run
# before the job has written its data - confirm whether waiting is needed.
%.ps: membench %.gp
	sbatch ./run_membench.sh $*
	if command -v module >/dev/null 2>&1; then module load gnuplot; fi; gnuplot $*.gp

# <name>.gp: instantiate the gnuplot template by replacing the placeholder
# name with <name>.
%.gp: gnuplot.template
	sed -e '/sarlacc/ s//$*/' $< > $@

# <name>.clean: delete the plot, script and data of <name>, job output files
# and the benchmark executable.
%.clean:
	rm -f $*.ps $*.gp $*.xxx *.out membench

# Pack the membench directory into ../membench.tar.
tar:
	cd ../ && tar cf membench.tar membench/*

View file

@ -0,0 +1,35 @@
set terminal postscript color
set output "generic.ps"
set style data linespoints
set style line 1 linetype 2
set style line 2 linetype 3
set style line 3 linetype 1
set logscale x 2
set nokey
set xtics (4,16,64,256,"1K" 1024,"4K" 4096,"16K" 16384,"64K" 65536,"256K" 262144,"1M" 1048576)
set title "10-Core Intel(R) Xeon(R) CPU E3-1585L v5 @ 3.00GHz Read+Write (ns) Versus Stride"
set xlabel "Stride (bytes)"
set ylabel "Time Read+Write (nanoseconds)"
set key on
plot 'generic.xxx' index 0 using 2:3 title "0.5 KB" with linespoints, \
'generic.xxx' index 1 using 2:3 title "1 KB" with linespoints, \
'generic.xxx' index 2 using 2:3 title "2 KB" with linespoints, \
'generic.xxx' index 3 using 2:3 title "4 KB" with linespoints, \
'generic.xxx' index 4 using 2:3 title "8 KB" with linespoints, \
'generic.xxx' index 5 using 2:3 title "16 KB" with linespoints, \
'generic.xxx' index 6 using 2:3 title "32 KB" with linespoints, \
'generic.xxx' index 7 using 2:3 title "64 KB" with linespoints, \
'generic.xxx' index 8 using 2:3 title "128 KB" with linespoints, \
'generic.xxx' index 9 using 2:3 title "256 KB" with linespoints, \
'generic.xxx' index 10 using 2:3 title "512 KB" with linespoints, \
'generic.xxx' index 11 using 2:3 title "1 MB" with linespoints, \
'generic.xxx' index 12 using 2:3 title "2 MB" with linespoints, \
'generic.xxx' index 13 using 2:3 title "4 MB" with linespoints, \
'generic.xxx' index 14 using 2:3 title "8 MB" with linespoints, \
'generic.xxx' index 15 using 2:3 title "16 MB" with linespoints, \
'generic.xxx' index 16 using 2:3 title "32 MB" with linespoints, \
'generic.xxx' index 17 using 2:3 title "64 MB" with linespoints

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,315 @@
512 4 0.422
512 8 0.439
512 16 0.403
512 32 0.403
512 64 0.398
512 128 0.456
512 256 0.726
1024 4 0.410
1024 8 0.420
1024 16 0.420
1024 32 0.407
1024 64 0.400
1024 128 0.391
1024 256 0.454
1024 512 0.728
2048 4 0.404
2048 8 0.408
2048 16 0.431
2048 32 0.419
2048 64 0.398
2048 128 0.438
2048 256 0.395
2048 512 0.464
2048 1024 0.724
4096 4 0.402
4096 8 0.430
4096 16 0.412
4096 32 0.427
4096 64 0.445
4096 128 0.404
4096 256 0.396
4096 512 0.402
4096 1024 0.461
4096 2048 0.714
8192 4 0.402
8192 8 0.403
8192 16 0.404
8192 32 0.431
8192 64 0.453
8192 128 0.469
8192 256 0.404
8192 512 0.428
8192 1024 0.406
8192 2048 0.491
8192 4096 0.753
16384 4 0.417
16384 8 0.403
16384 16 0.441
16384 32 0.441
16384 64 0.443
16384 128 0.505
16384 256 0.498
16384 512 0.420
16384 1024 0.523
16384 2048 0.631
16384 4096 0.605
16384 8192 0.705
32768 4 0.425
32768 8 0.412
32768 16 0.403
32768 32 0.403
32768 64 0.406
32768 128 0.406
32768 256 0.427
32768 512 0.463
32768 1024 0.505
32768 2048 0.670
32768 4096 0.918
32768 8192 0.581
32768 16384 0.702
65536 4 0.401
65536 8 0.403
65536 16 0.447
65536 32 0.466
65536 64 0.925
65536 128 1.306
65536 256 1.335
65536 512 1.885
65536 1024 2.523
65536 2048 2.266
65536 4096 3.132
65536 8192 0.913
65536 16384 0.582
65536 32768 0.701
131072 4 0.415
131072 8 0.403
131072 16 0.425
131072 32 0.456
131072 64 0.904
131072 128 1.307
131072 256 1.334
131072 512 1.848
131072 1024 2.393
131072 2048 2.863
131072 4096 3.742
131072 8192 3.120
131072 16384 0.887
131072 32768 0.581
131072 65536 0.701
262144 4 0.438
262144 8 0.440
262144 16 0.472
262144 32 0.518
262144 64 1.014
262144 128 1.698
262144 256 1.630
262144 512 2.226
262144 1024 2.609
262144 2048 3.078
262144 4096 3.927
262144 8192 3.755
262144 16384 3.272
262144 32768 0.948
262144 65536 0.622
262144 131072 0.701
524288 4 0.405
524288 8 0.431
524288 16 0.481
524288 32 0.818
524288 64 1.572
524288 128 2.656
524288 256 2.957
524288 512 3.704
524288 1024 3.985
524288 2048 4.254
524288 4096 4.515
524288 8192 4.148
524288 16384 3.985
524288 32768 3.220
524288 65536 0.982
524288 131072 0.697
524288 262144 0.767
1048576 4 0.404
1048576 8 0.418
1048576 16 0.546
1048576 32 0.800
1048576 64 1.499
1048576 128 2.615
1048576 256 3.033
1048576 512 3.902
1048576 1024 4.197
1048576 2048 4.332
1048576 4096 4.264
1048576 8192 4.419
1048576 16384 4.799
1048576 32768 5.757
1048576 65536 5.498
1048576 131072 0.957
1048576 262144 0.635
1048576 524288 0.704
2097152 4 0.635
2097152 8 0.647
2097152 16 0.556
2097152 32 0.954
2097152 64 1.703
2097152 128 2.897
2097152 256 3.222
2097152 512 4.473
2097152 1024 4.570
2097152 2048 4.091
2097152 4096 4.077
2097152 8192 4.546
2097152 16384 5.193
2097152 32768 5.117
2097152 65536 4.991
2097152 131072 4.379
2097152 262144 1.031
2097152 524288 0.673
2097152 1048576 0.726
4194304 4 0.445
4194304 8 0.636
4194304 16 0.848
4194304 32 1.132
4194304 64 1.639
4194304 128 3.468
4194304 256 3.918
4194304 512 4.942
4194304 1024 4.904
4194304 2048 4.221
4194304 4096 4.554
4194304 8192 5.309
4194304 16384 5.732
4194304 32768 5.519
4194304 65536 5.235
4194304 131072 4.912
4194304 262144 4.715
4194304 524288 1.013
4194304 1048576 0.594
4194304 2097152 0.740
8388608 4 0.549
8388608 8 0.755
8388608 16 1.192
8388608 32 1.612
8388608 64 3.242
8388608 128 5.062
8388608 256 4.817
8388608 512 4.999
8388608 1024 6.507
8388608 2048 7.264
8388608 4096 5.980
8388608 8192 4.671
8388608 16384 4.947
8388608 32768 5.228
8388608 65536 5.524
8388608 131072 5.782
8388608 262144 4.873
8388608 524288 3.119
8388608 1048576 0.965
8388608 2097152 0.580
8388608 4194304 0.702
16777216 4 0.581
16777216 8 0.790
16777216 16 1.447
16777216 32 2.226
16777216 64 5.026
16777216 128 7.217
16777216 256 11.799
16777216 512 10.725
16777216 1024 13.057
16777216 2048 14.465
16777216 4096 14.082
16777216 8192 8.786
16777216 16384 5.607
16777216 32768 5.368
16777216 65536 5.643
16777216 131072 5.823
16777216 262144 5.693
16777216 524288 4.243
16777216 1048576 3.339
16777216 2097152 1.109
16777216 4194304 0.579
16777216 8388608 0.702
33554432 4 0.540
33554432 8 0.781
33554432 16 1.266
33554432 32 2.442
33554432 64 5.082
33554432 128 8.243
33554432 256 12.508
33554432 512 12.972
33554432 1024 17.293
33554432 2048 20.517
33554432 4096 15.132
33554432 8192 15.801
33554432 16384 9.443
33554432 32768 7.183
33554432 65536 7.054
33554432 131072 6.064
33554432 262144 5.443
33554432 524288 5.409
33554432 1048576 4.329
33554432 2097152 3.734
33554432 4194304 0.956
33554432 8388608 0.579
33554432 16777216 0.699
67108864 4 0.539
67108864 8 0.794
67108864 16 1.373
67108864 32 2.531
67108864 64 5.398
67108864 128 9.175
67108864 256 14.549
67108864 512 17.047
67108864 1024 19.841
67108864 2048 21.698
67108864 4096 15.342
67108864 8192 14.757
67108864 16384 14.683
67108864 32768 11.404
67108864 65536 9.460
67108864 131072 6.333
67108864 262144 5.737
67108864 524288 5.177
67108864 1048576 4.878
67108864 2097152 4.219
67108864 4194304 3.114
67108864 8388608 0.967
67108864 16777216 0.638
67108864 33554432 0.707

View file

@ -0,0 +1,35 @@
# Template for the membench plot. The data-set name that appears in the output
# and data file names below is a placeholder which the Makefile and the run
# script replace with the real name (via sed) to produce the final .gp file.
set terminal postscript color
set output "sarlacc.ps"
set style data linespoints
set style line 1 linetype 2
set style line 2 linetype 3
set style line 3 linetype 1
# Strides are powers of two, hence the base-2 logarithmic x axis.
set logscale x 2
set nokey
set xtics (4,16,64,256,"1K" 1024,"4K" 4096,"16K" 16384,"64K" 65536,"256K" 262144,"1M" 1048576)
# NOTE(review): confirm that the CPU named in the title is the machine the
# data was measured on.
set title "10-Core Intel(R) Xeon(R) CPU E3-1585L v5 @ 3.00GHz Read+Write (ns) Versus Stride"
set xlabel "Stride (bytes)"
set ylabel "Time Read+Write (nanoseconds)"
# Re-enables the legend that "set nokey" switched off above.
set key on
plot 'sarlacc.xxx' index 0 using 2:3 title "0.5 KB" with linespoints, \
'sarlacc.xxx' index 1 using 2:3 title "1 KB" with linespoints, \
'sarlacc.xxx' index 2 using 2:3 title "2 KB" with linespoints, \
'sarlacc.xxx' index 3 using 2:3 title "4 KB" with linespoints, \
'sarlacc.xxx' index 4 using 2:3 title "8 KB" with linespoints, \
'sarlacc.xxx' index 5 using 2:3 title "16 KB" with linespoints, \
'sarlacc.xxx' index 6 using 2:3 title "32 KB" with linespoints, \
'sarlacc.xxx' index 7 using 2:3 title "64 KB" with linespoints, \
'sarlacc.xxx' index 8 using 2:3 title "128 KB" with linespoints, \
'sarlacc.xxx' index 9 using 2:3 title "256 KB" with linespoints, \
'sarlacc.xxx' index 10 using 2:3 title "512 KB" with linespoints, \
'sarlacc.xxx' index 11 using 2:3 title "1 MB" with linespoints, \
'sarlacc.xxx' index 12 using 2:3 title "2 MB" with linespoints, \
'sarlacc.xxx' index 13 using 2:3 title "4 MB" with linespoints, \
'sarlacc.xxx' index 14 using 2:3 title "8 MB" with linespoints, \
'sarlacc.xxx' index 15 using 2:3 title "16 MB" with linespoints, \
'sarlacc.xxx' index 16 using 2:3 title "32 MB" with linespoints, \
'sarlacc.xxx' index 17 using 2:3 title "64 MB" with linespoints

View file

@ -0,0 +1,168 @@
/* ==================================================================== *
* *
* membench.c -- Measurement of the performance of the memory *
* hierarchy. *
* *
* ==================================================================== */
#include <unistd.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/times.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <limits.h>
#define CACHE_MIN (128) /* smallest cache */
#define CACHE_MAX (16 * 1024 * 1024) /* largest cache */
#define SAMPLE 10 /* to get larger time sample */
int x[CACHE_MAX]; /* stride thru this array */
/**
* Get the number of CPU ticks since last booting the computer
*/
/* Read the processor's time-stamp counter with the x86 `rdtsc` instruction,
 * which delivers the low 32 bits in EAX and the high 32 bits in EDX; the two
 * halves are combined into one 64-bit tick count.
 *
 * Declared `static inline`: with C99 inline semantics a plain `inline`
 * definition does not provide an external definition, so any call the
 * compiler chooses not to inline (e.g. without optimisation) would fail to
 * link. */
static inline unsigned long long getCPUTick (void)
{
  unsigned lo, hi;
  asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
  return ((unsigned long long) hi << 32) | lo;
}
/**
* Get the current system time in milliseconds
*/
/* Current time of day from gettimeofday(), expressed in milliseconds: the
 * seconds field scaled by 1000 plus the microseconds field divided by 1000
 * (integer division). */
unsigned long timeGetTime (void)
{
  struct timeval now;
  gettimeofday (&now, 0);

  time_t ms_from_seconds = now.tv_sec * 1000;
  suseconds_t ms_from_micros = now.tv_usec / 1000;
  return ms_from_seconds + ms_from_micros;
}
/**
* Determine the CPU clock speed.
* @param nTime The time in milliseconds used to perform the measurement
*/
/* Count the CPU ticks that elapse during a busy-wait of slightly more than
 * nTime milliseconds, as measured with timeGetTime(). The return value is a
 * tick count for that interval; it corresponds to ticks per second only when
 * nTime is 1000 (the value main passes). */
unsigned long getCPUSpeed (long nTime)
{
long long timeStart, timeStop;
long long startTick, endTick;
/* Difference of two back-to-back counter reads, added to the result below.
 * NOTE(review): the order in which the two calls are evaluated is not fixed
 * by the C language, so the sign of this value is not guaranteed. */
long long overhead = getCPUTick () - getCPUTick ();
/* Calculate Starting Time And Start Tick */
timeStart = timeGetTime ();
/* keep re-reading while two consecutive readings are equal */
while (timeGetTime () == timeStart)
timeStart = timeGetTime();
/* spin until more than 1 ms has passed, then take the starting tick */
while (1)
{
timeStop = timeGetTime ();
if ((timeStop - timeStart) > 1)
{
startTick = getCPUTick ();
break;
}
}
/* Calculate Stop Time And End Tick */
/* spin until more than nTime ms have passed since the starting tick was taken */
timeStart = timeStop;
while (1)
{
timeStop = timeGetTime();
if ((timeStop - timeStart) > nTime)
{
endTick = getCPUTick();
break;
}
}
/* Return The Processors Speed In Hertz */
/* (ticks counted over the measured interval, plus the overhead term) */
return (unsigned long) ((endTick - startTick) + (overhead));
}
/* Memory-hierarchy benchmark driver.
 *
 * For every array size csize (in ints, doubling from CACHE_MIN to CACHE_MAX)
 * and every stride (in ints, doubling from 1 to csize/2) it repeatedly
 * increments the elements of x reached with that stride until the accumulated
 * tick count reaches nHz (the ticks counted by getCPUSpeed(1000)), then runs
 * an equally long empty loop, and prints one result line. A pair of blank
 * lines separates the blocks of different array sizes.
 *
 * Always returns 0. */
int main ()
{
  /* temp starts at 0 so that the dummy loop below never reads an
   * indeterminate value */
  register int i, index, stride, limit, temp = 0;
  long steps, tsteps;
  int csize;

  /* timing variables */
  double sec;
  /* number of processor cycles used */
  unsigned long long cycles0, cycles;
  /* The CPU speed in Hz */
  unsigned long nHz = getCPUSpeed (1000);

  for (csize = CACHE_MIN; csize <= CACHE_MAX; csize <<= 1)
  {
    for (stride = 1; stride <= csize / 2; stride <<= 1)
    {
      /* init cycles counter */
      cycles = 0;
      /* cache size this loop */
      limit = csize - stride + 1;

      steps = 0;
      do
      {
        cycles0 = getCPUTick ();
        for (i = SAMPLE * stride; i != 0; i--)
        {
          /* larger sample */
          for (index = 0; index < limit; index += stride)
          {
            /* cache access */
            x[index] = x[index] + 1;
          }
        }
        /* count while loop iterations */
        steps++;
        cycles += getCPUTick () - cycles0;
      } while (cycles < nHz); /* repeat until collected 1 sec */

      /* NOTE(review): sec is fixed here, before the loop overhead is
       * subtracted from cycles below, so the reported read+write time still
       * contains the loop overhead - confirm this is intended. */
      sec = cycles / (double) nHz;

      /* repeat empty loop to subtract loop overhead */
      /* used to match # while iterations */
      tsteps = 0;
      /* repeat until same # iterations as above */
      do
      {
        cycles0 = getCPUTick ();
        for (i = SAMPLE * stride; i != 0; i--)
        {
          /* larger sample */
          for (index = 0; index < limit; index += stride)
          {
            /* dummy code */
            temp = temp + index;
          }
        }
        /* count while loop iterations */
        tsteps++;
        cycles -= getCPUTick () - cycles0;
      } while (tsteps < steps);

      /* sizes are size_t values (%zu); cycles may have wrapped below zero
       * after the subtraction above and is printed as a signed value */
      printf ("Size:%7zu Stride:%7zu read+write:%10.3f ns, sec = %6.3f, cycles = %lld steps = %6.0f\n",
              csize * sizeof (int), stride * sizeof (int),
              (double) sec * 1e9 / (steps * SAMPLE * stride * ((limit - 1) / stride + 1)),
              sec, (long long) cycles, (double) steps);
      fflush(stdout);
    }
    printf ("\n\n");
  }

  return 0;
}

View file

@ -0,0 +1,16 @@
#!/bin/bash -l
#SBATCH --job-name=membench
#SBATCH --time=00:30:00
#SBATCH --nodes=1
#SBATCH --output=membench-%j.out
#SBATCH --error=membench-%j.err
# Batch script: runs membench, stores the (size, stride, time) triples in
# <name>.xxx, instantiates gnuplot.template as <name>.gp and plots it.
# <name> is the optional first argument (the Makefile passes it) and defaults
# to "generic".
name="${1:-generic}"

# load modules
if command -v module 1>/dev/null 2>&1; then
  module load gcc/10.1.0 gnuplot
fi

# membench prints lines such as
#   Size:    512 Stride:      4 read+write:     0.422 ns, sec = ...
# Splitting on runs of ':' and ' ' makes fields 2, 4 and 6 the size, the stride
# and the time; they are written tab-separated. Empty lines are passed through
# unchanged because gnuplot uses them to separate the data blocks ("index").
./membench | awk -F'[: ]+' 'NF { print $2 "\t" $4 "\t" $6; next } { print }' > "${name}.xxx" && sed -e "/sarlacc/ s//${name}/" gnuplot.template > "${name}.gp"

gnuplot "${name}.gp"

BIN
Project1/usi_inf.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 97 KiB