\documentclass[unicode,11pt,a4paper,oneside,numbers=endperiod,openany]{scrartcl}

\input{assignment.sty}

\usepackage{float}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{tikz}
\usetikzlibrary{decorations.markings}
\usepackage{url}
\hypersetup{pdfborder = {0 0 0}}
\usepackage{xcolor}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{booktabs}
\usepackage{algorithm2e}
\usepackage[nomessages]{fp}

\begin{document}
\setassignment
\setduedate{12.10.2022 (midnight)}

\serieheader{High-Performance Computing Lab}{2022}{Student: Claudio Maggioni}{
Discussed with: M. Cattaneo, G. De Vita, V. Karpenko}{Solution for Project 1}{}
\newline

%\assignmentpolicy
%In this project you will practice memory access optimization,
%performance-oriented programming, and OpenMP parallelizaton on the ICS Cluster.

\tableofcontents

\section{Explaining Memory Hierarchies \punkte{25}}

\subsection{Memory Hierarchy Parameters of the Cluster}

By invoking \texttt{likwid-topology} for the cache topology and \texttt{free -g}
for the amount of primary memory, the following memory hierarchy parameters are
found:

\begin{center}
\begin{tabular}{ll}
Main memory & 62 GB \\
L3 cache & 25 MB per socket \\
L2 cache & 256 kB per core \\
L1 cache & 32 kB per core
\end{tabular}
\end{center}

All values are reported using base 2 IEC byte units. The cluster has 2 sockets
and a total of 20 cores (10 per socket). The cache topology diagram reported by
\texttt{likwid-topology -g} is shown in Figure \ref{fig:topo}.

\pagebreak[4]

\begin{figure}[t]
\begin{center}
Socket 0:\vspace{0.3cm}

\begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
\hline 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\\hline
32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB \\\hline
256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB \\\hline
\multicolumn{10}{|c|}{25 MB} \\\hline
\end{tabular}\vspace{0.8cm}\\
Socket 1:\vspace{0.3cm}

\begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
\hline 10 & 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 \\\hline
32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB \\\hline
256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB \\\hline
\multicolumn{10}{|c|}{25 MB} \\\hline
\end{tabular}
\end{center}
\caption{Cache topology diagram as outputted by \texttt{likwid-topology -g}.
Byte sizes all in IEC units.}
\label{fig:topo}
\end{figure}

\subsection{Memory Access Pattern of \texttt{membench.c}}

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
\tikzset{->-/.style={decoration={
markings,
mark=at position .75 with {\arrow{>}}},postaction={decorate}}};

\draw (0,0) grid (5,1);
\draw [dashed] (5,0) -- (5.5,0);
\draw [dashed] (5,1) -- (5.5,1);
\draw [dashed] (6.5,0) -- (7,0);
\draw [dashed] (6.5,1) -- (7,1);
\draw (7,0) grid (12,1);

\foreach \r in {0,1,...,4}{
\fill (\r + 0.5,0.5) circle [radius=2pt];
\draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);
\draw (\r + 0.5, -0.5) node {$\r$};
}
\draw[->-] (4.5,0.5) to[bend left] (5.5,0.5);
\foreach \r in {7,8,...,11}{
\fill (\r + 0.5,0.5) circle [radius=2pt];
\FPeval{l}{round(\r + 128 - 12, 0)}
\draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);
\draw (\r + 0.5, -0.5) node {$\l$};
}

\draw (0,-3) grid (3,-2);
\draw [dashed] (3,-2) -- (3.5,-2);
\draw [dashed] (3,-3) -- (3.5,-3);
\draw [dashed] (4,-2) -- (4.5,-2);
\draw [dashed] (4,-3) -- (4.5,-3);
\draw (4.5,-2) -- (7.5,-2);
\draw (4.5,-3) -- (7.5,-3);
\foreach \r in {4.5,5.5,...,7.5}{
\draw (\r,-3) -- (\r,-2);
}
\draw [dashed] (7.5,-2) -- (8,-2);
\draw [dashed] (7.5,-3) -- (8,-3);
\draw [dashed] (8.5,-2) -- (9,-2);
\draw [dashed] (8.5,-3) -- (9,-3);
\draw (9,-3) grid (12,-2);

\fill (0.5,-2.5) circle [radius=2pt];
\fill (6,-2.5) circle [radius=2pt];
\foreach \r in {0,1,2}{
\draw (\r + 0.5, -3.5) node {$\r$};
}
\foreach \r in {9,10,11}{
\FPeval{l}{round(\r - 12, 0)}
\draw (\r + 0.5, -3.5) node {\tiny $2^{20} \l$};
}
\draw (5, -3.5) node {\tiny $2^{19} - 1$};
\draw (6, -3.5) node {\tiny $2^{19}$};
\draw (7,-3.5) node {\tiny $2^{19} + 1$};
\draw[->-] (-0.5,-2.5) to[bend left] (0.5,-2.5);
\draw[->-] (0.5,-2.5) to[bend left] (6,-2.5);
\end{tikzpicture}
\end{center}
\caption{Memory access patterns of \texttt{membench.c} for \texttt{csize =
128} and \texttt{stride = 1} (above) and for \texttt{csize = $2^{20}$} and
\texttt{stride = $2^{19}$} (below)}
\label{fig:access}
\end{figure}

The benchmark \texttt{membench.c} measures the average time of repeated read and
write operations across a set of indices of a stack-allocated array of 32-bit
signed integers. The indices vary according to the access pattern used, which in
turn is defined by two variables, \texttt{csize} and \texttt{stride}.
\texttt{csize} is an upper bound on the index value, i.e.\ one more than the
highest index used to access the array in the pattern. \texttt{stride}
determines the difference between successive array indices, i.e.\ a
\texttt{stride} of 1 will access every array index, a \texttt{stride} of 2 will
skip every other index, a \texttt{stride} of 4 will access one index and then
skip the next three, and so on. The benchmark stops when the index to access is
strictly greater than \texttt{csize - stride}.

Therefore, for \texttt{csize = 128} and \texttt{stride = 1} the benchmark will
access all indices between 0 and 127 sequentially, while for \texttt{csize =
$2^{20}$} and \texttt{stride = $2^{19}$} it will access only index 0 and then
index $2^{19}$. The access patterns for these
two configurations are shown visually in Figure \ref{fig:access}.
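
In terms of code, each pass of the benchmark therefore corresponds to a loop of
roughly the following shape (a minimal sketch for illustration only, not the
verbatim \texttt{membench.c} source; the function and parameter names are mine):

\begin{verbatim}
/* One pass over the access pattern: read-modify-write every stride-th
 * element of x, stopping once the next index would exceed csize - stride. */
void access_pass(int *x, int csize, int stride) {
    for (int index = 0; index <= csize - stride; index += stride)
        x[index]++; /* one read and one write per access */
}
\end{verbatim}

The benchmark then repeats such passes and reports the average time per access,
as described above.
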
By running \texttt{membench.c} both on my personal laptop and on the
cluster, the results shown in Figure \ref{fig:mem} are obtained. \textit{csize}
values are shown as different data series, labeled by byte size, and
\textit{stride} values are likewise mapped on the $x$ axis by their
byte-equivalent value\footnote{Byte values are a factor of 4 greater than the
values used in the code and in Figure \ref{fig:mem}, since the array elements
used in the benchmark are 32-bit signed integers, which take up 4 bytes each.}.
For \texttt{csize = 128} ($= 512$ bytes) and \texttt{stride = 1} ($= 4$ bytes)
the mean access time is $0.124$ nanoseconds, while for \texttt{csize =
$2^{20}$} ($= 4$ MB) and \texttt{stride = $2^{19}$} ($= 2$ MB) it is $1.156$
nanoseconds. The first set of parameters performs well thanks to the low
\textit{stride} value, which achieves very good spatial locality and maximizes
cache hits. However, the second set of parameters achieves good performance as
well thanks to the few distinct values accessed in each pass, which improves
the temporal locality of each accessed address. The same observation applies to
the last few data points in each data series of Figure \ref{fig:mem}, i.e.\ to
\textit{stride} values close to \textit{csize}.

\subsection{Analyzing Benchmark Results}

\begin{figure}[t]
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{generic_macos.pdf}
\caption{Personal laptop}
\label{fig:mem:laptop}
\end{subfigure}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{generic_cluster.pdf}
\caption{Cluster}
\label{fig:mem:cluster}
\end{subfigure}
\caption{Results of the \texttt{membench.c} benchmark for both my personal
laptop (in Figure \ref{fig:mem:laptop}) and the cluster (in Figure
\ref{fig:mem:cluster}).}
\label{fig:mem}
\end{figure}

The \texttt{membench.c} benchmark results for my personal laptop (a MacBook Pro
2018 with a Core i7-8750H CPU) and the cluster are shown in Figure
\ref{fig:mem}.

The memory access graph for the cluster's benchmark results shows that temporal
locality is best for small array sizes and for small \texttt{stride} values.
In particular, for array memory sizes of 16 MB or lower (\texttt{csize} of $4
\cdot 2^{20}$ or lower) and \texttt{stride} values of 2048 or lower the mean
read+write time is less than 10 nanoseconds. Temporal locality is worst for
large sizes and strides, although the largest values of \texttt{stride} for each
size (like \texttt{csize / 2} and \texttt{csize / 4}) achieve better mean times
due to the aforementioned effect of having \textit{stride} values close to
\textit{csize}.

The pattern that can be read from the graphs, especially the one for the
cluster, shows that the \textit{stride} axis is divided into regions with
memory access times of similar magnitude. The boundary between the first and
the second region is a \textit{stride} value of roughly 2 kB, while a
\textit{stride} of roughly 512 kB separates the second and the third region.
The difference in performance between regions and the similarity of performance
within regions suggest that the threshold stride values are related to changes
in the use of the cache hierarchy. In particular, the first region may
correspond to accesses predominantly served by the L1 cache, the fastest
non-register memory available; the second region might then correspond to a
more intense use of the L2 cache, and likewise the third region to the L3
cache.
\marginpar[right text]{\color{white}\url{https://youtu.be/JzJlzGaQFoc}}

\section{Optimize Square Matrix-Matrix Multiplication \punkte{60}}

\begin{figure}[t]
\begin{verbatim}
INPUT: A (n by n), B (n by n), n
OUTPUT: C (n by n)

s := 32 # block dimension

A_row := <matrix A converted in row major form>
C_temp := <empty s by s matrix>

for i := 0 to n by s:
    i_next := min(i + s, n)

    for j := 0 to n by s:
        j_next := min(j + s, n)

        <set all cells in C_temp to 0>

        for k := 0 to n by s:
            k_next := min(k + s, n)

            # Perform naive matrix multiplication, incrementing cells of C_temp
            # with each multiplication result
            naivemm(A_row[i, k][i_next, k_next], B[k, j][k_next, j_next],
                    C_temp[0, 0][i_next - i, j_next - j])
        end for

        C[i, j][i_next, j_next] = C_temp[0, 0][i_next - i, j_next - j]
    end for
end for
\end{verbatim}
\caption{Pseudocode listing of my blocked matrix multiplication
implementation. Matrix indices start from 0 (i.e. row $0$ and column $0$
denote the top-left-most cell in a matrix). \\ \texttt{M[a, b][c, d]} denotes
a rectangular region of the matrix $M$ whose top-left-most cell is the cell
in $M$ at row $a$ and column $b$ and whose bottom-right-most cell is the
cell in $M$ at row $c - 1$ and column $d - 1$.}
\label{fig:algo}
\end{figure}

The file \texttt{matmult/dgemm-blocked.c} contains a C implementation of the
blocked matrix multiplication algorithm presented in the project. A pseudocode
listing of the implementation is provided in Figure \ref{fig:algo}.

In order to achieve a correct and fast execution, my implementation:

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
\fill[blue!60!white] (4,0) rectangle (5,-2);
\fill[blue!40!white] (4,-2) rectangle (5,-4);
\fill[blue!60!white] (0,-4) rectangle (2,-5);
\fill[blue!40!white] (2,-4) rectangle (4,-5);
\fill[green!40!white] (4,-4) rectangle (5,-5);
\draw[step=1,gray,very thin] (0,0) grid (5,-5);
\draw[step=2] (0,0) grid (5,-5);
\draw[step=5] (0,0) grid (5,-5);
\end{tikzpicture}
\end{center}
\caption{Result of the block division process of a square matrix of size 5
using a block size of 2. The 2-by-1 and 1-by-2 rectangular remainders are
shown in blue and the square matrix of remainder size (i.e. 1) is shown in
green.}
\label{fig:matrix}
\end{figure}

\begin{itemize}
\item Handles the edge cases related to the ``remainders'' of the matrix
block division, i.e.\ when the division between the size of the matrix
and the block size yields a remainder. Assuming only square matrices
are multiplied through the algorithm (as in the test suite provided), the
block division can yield rectangular matrix blocks located in the last
rows and columns of each matrix, and the bottom-right corner of the
matrix will be contained in a square block whose size is the
remainder. The result of this process is shown in Figure
\ref{fig:matrix};
\item Converts matrix A into row-major format. As shown in Figure
\ref{fig:iter}, by having A in row-major format and B in column-major
format, the iterations across matrix blocks in the innermost loop of the
algorithm (the one calling \textit{naivemm}) maximise cache hits thanks
to the spatial locality of the blocks used (see the code sketch after
this list). This yielded an increase in performance of approximately two
percentage points of CPU utilization (i.e. from a baseline of $4\%$ to
$6\%$);
\item Caches the result of each innermost iteration in a temporary matrix
of block size before storing it into matrix C. This achieves better
spatial locality when \textit{naivemm} needs to store values in matrix C:
the block-sized temporary matrix has virtually no stride, and thus cache
hits are maximised. The copy back into C is implemented with bulk
\texttt{memcpy} calls (again, see the sketch after this list). This
optimization gains an extra half of a percentage point of CPU utilization
(i.e. from the $6\%$ discussed above to roughly $6.5\%$);
\item Exploits some compiler optimizations, namely running the optimizer
at the \texttt{-O3} setting and using the \texttt{-ffast-math}
and \texttt{-march=haswell} flags to respectively enable some floating point
arithmetic optimizations and to target the exact ISA
of the cluster's processor. Note that these flags are applied to all
implementations used in the benchmark, as the flags were added in the
\texttt{Makefile}. In addition, the \texttt{naivemm} algorithm was
inlined in the actual \texttt{dgemm-blocked.c} source code instead of
being a separate function, enabling further
compiler optimizations. All these changes increased the
CPU utilization from $6.5\%$ to an average of $13.45\%$.
\end{itemize}
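
The following fragment illustrates the two memory-layout optimizations above
(the row-major copy of A and the \texttt{memcpy}-based copy-back of
\texttt{C\_temp}). It is an illustrative sketch rather than the verbatim
contents of \texttt{dgemm-blocked.c}: it assumes, as suggested by Figure
\ref{fig:iter}, that the matrices are handed to the routine in column-major
order, and that \texttt{C\_temp} is packed column-major with the block height
as its leading dimension. The function names are mine.

\begin{verbatim}
#include <stdlib.h>
#include <string.h>

/* Convert A (column-major, n by n) into a freshly allocated row-major copy,
 * so that the innermost block multiplication walks it with unit stride. */
static double *to_row_major(const double *A, int n) {
    double *A_row = malloc((size_t)n * n * sizeof(double));
    for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++)
            A_row[r * n + c] = A[c * n + r];
    return A_row;
}

/* Copy the accumulated block C_temp (rows by cols, packed column-major)
 * back into the column-major matrix C at position (i, j), one contiguous
 * column at a time. */
static void store_block(double *C, int n, int i, int j,
                        const double *C_temp, int rows, int cols) {
    for (int jj = 0; jj < cols; jj++)
        memcpy(&C[(size_t)(j + jj) * n + i], /* column j + jj of C, from row i */
               &C_temp[(size_t)jj * rows],   /* column jj of the temporary block */
               (size_t)rows * sizeof(double));
}
\end{verbatim}

In the real implementation these operations happen around the blocked loops of
Figure \ref{fig:algo}; the fragment only shows the layout handling.
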
The chosen matrix block size for running the benchmark on the cluster is
$$s = 32$$
as shown in the pseudocode. This value was obtained through an empirical binary
search over candidate values, using the benchmark itself as a metric, i.e.\ by
running \texttt{./run\_matrixmult.sh} several times with different block sizes.
For square blocks (i.e.\ the worst case) the combined size of the $A$ and $B$
sub-blocks and of the \texttt{C\_temp} temporary block for $C$ is
$$\mathrm{Bytes} = \mathrm{cellSize} \cdot s^2 \cdot 3 = 8 \cdot 32^2 \cdot 3 = 24576$$
given that a double-precision floating point number, the data type used for
matrix cells in the scope of this project, is 8 bytes long. The resulting total
size is fairly close to the L1 cache size of the processor used in the cluster
($32\,\mathrm{kB} = 32768$ bytes), which is expected given that the algorithm
needs to exploit the fastest memory as much as possible. The reason the
empirically best value results in a theoretical cache allocation of only about
three quarters of the complete L1 cache size comes down to real-life factors:
for example, cache misses typically load entire aligned cache lines, which may
bring unneeded data into the cache.
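
For reference, the block size that would exactly fill the 32 kB L1 cache with
these three blocks is
$$s_{\mathrm{full}} = \sqrt{\frac{32768}{3 \cdot 8}} \approx 36.9,$$
so the empirically chosen $s = 32$ leaves roughly a quarter of the L1 cache as
slack for such effects.
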
A potential way to exploit the different cache levels further is to apply the
blocking scheme hierarchically, with multiple levels of nested blocks. For
example, OpenBLAS implements DGEMM with two levels of matrix blocks to better
exploit the L2 and L3 caches found on most processors.
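
A sketch of what such a two-level scheme could look like is given below. It is
purely illustrative and not part of the submitted code: the outer block size of
256 is a made-up placeholder, and the \texttt{C\_temp} and \texttt{memcpy}
optimizations discussed earlier are omitted for brevity.

\begin{verbatim}
/* Hypothetical two-level blocking sketch: an outer block size targets the
 * L2/L3 caches, the inner block size (32, as in the submitted code) targets
 * L1, and the innermost three loops are the naive multiplication.
 * A_row is row-major; B and C are column-major; all matrices are n by n.
 * S_OUTER is a multiple of S_INNER so inner blocks never straddle an outer
 * block boundary. */
enum { S_OUTER = 256, S_INNER = 32 };

static int imin(int a, int b) { return a < b ? a : b; }

void dgemm_two_level(int n, const double *A_row, const double *B, double *C) {
    for (int I = 0; I < n; I += S_OUTER)
    for (int J = 0; J < n; J += S_OUTER)
    for (int K = 0; K < n; K += S_OUTER)
        /* inner level: L1-sized blocking within the current outer block */
        for (int i = I; i < imin(I + S_OUTER, n); i += S_INNER)
        for (int j = J; j < imin(J + S_OUTER, n); j += S_INNER)
        for (int k = K; k < imin(K + S_OUTER, n); k += S_INNER)
            /* innermost: naive multiplication of one block pair */
            for (int ii = i; ii < imin(i + S_INNER, n); ii++)
            for (int jj = j; jj < imin(j + S_INNER, n); jj++)
            for (int kk = k; kk < imin(k + S_INNER, n); kk++)
                C[jj * n + ii] += A_row[ii * n + kk] * B[jj * n + kk];
}
\end{verbatim}

The outer loops keep a larger working set resident in the outer cache levels
while the inner loops reuse it one L1-sized block at a time.
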

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
\node[align=center] at (2.5,0.5) {Matrix A};
\fill[orange!10!white] (0,0) rectangle (2,-2);
\fill[orange!25!white] (2,0) rectangle (4,-2);
\fill[orange!40!white] (4,0) rectangle (5,-2);

\draw[step=1,gray,very thin] (0,0) grid (5,-5);
\draw[step=2,black,thick] (0,0) grid (5,-5);
\draw[step=5,black,thick] (0,0) grid (5,-5);

\draw[-to,step=1,red,very thick] (0.5,-0.5) -- (4.5,-0.5);
\draw[-to,step=1,red,very thick] (0.5,-1.5) -- (4.5,-1.5);
\draw[-to,step=1,red,very thick] (0.5,-2.5) -- (4.5,-2.5);
\draw[-to,step=1,red,very thick] (0.5,-3.5) -- (4.5,-3.5);
\draw[-to,step=1,red,very thick] (0.5,-4.5) -- (4.5,-4.5);

\node[align=center] at (8.5,0.5) {Matrix B};
\fill[orange!10!white] (6,0) rectangle (8,-2);
\fill[orange!25!white] (6,-2) rectangle (8,-4);
\fill[orange!40!white] (6,-4) rectangle (8,-5);

\draw[step=1,gray,very thin] (6,0) grid (11,-5);
\draw[step=2,black,thick] (6,0) grid (11,-5);
\draw[step=5,black,thick] (6,0) grid (11,-5);
\draw[black,thick] (11,0) -- (11,-5);

\draw[-to,step=1,red,very thick] (6.5,-0.5) -- (6.5,-4.5);
\draw[-to,step=1,red,very thick] (7.5,-0.5) -- (7.5,-4.5);
\draw[-to,step=1,red,very thick] (8.5,-0.5) -- (8.5,-4.5);
\draw[-to,step=1,red,very thick] (9.5,-0.5) -- (9.5,-4.5);
\draw[-to,step=1,red,very thick] (10.5,-0.5) -- (10.5,-4.5);
\end{tikzpicture}
\end{center}
\caption{Innermost loop iteration of the blocked GEMM algorithm across
matrices A and B. The red lines represent the ``majorness'' of each matrix
(A is converted by the algorithm into row-major form, while B is given and
used in column-major form). The shades of orange represent the blocks used
in each iteration.}
\label{fig:iter}
\end{figure}

The results of the matrix multiplication benchmark for the naive, blocked, and
BLAS implementations are shown in Figure \ref{fig:bench} as a graph of GFlop/s
over matrix size, and in Figure \ref{fig:benchtab} as a table. The blocked
implementation achieves up to 200\% more FLOPS than the naive implementation for
the largest matrix dimensions. However, the blocked implementation achieves only
roughly an eighth of the FLOPS of the Intel MKL BLAS-based implementation.

I was unable to run this benchmark suite on my personal machine due to Intel MKL
installation issues that prevented the code from compiling.

\begin{figure}[t]
\includegraphics[width=\textwidth]{timing.pdf}
\caption{GFlop/s per matrix size of the matrix multiplication benchmark for the naive,
blocked, and BLAS implementations. The Y-axis is log-scaled.}
\label{fig:bench}
\end{figure}

\begin{figure}[t]
\begin{center}
\begin{tabular}{c|cc|cc|cc}
\toprule
& \multicolumn{2}{c|}{Naive} & \multicolumn{2}{c|}{Blocked} &
\multicolumn{2}{c}{BLAS} \\
\makecell{Size} & \makecell{MFLOPS} & \makecell{\% CPU} &
\makecell{MFLOPS} & \makecell{\% CPU} &
\makecell{MFLOPS} & \makecell{\% CPU} \\
\midrule
31 & 3140.45 & 8.53 & 3844.56 & 10.45 & 25677.4 & 69.78 \\
32 & 3364.78 & 9.14 & 5342.55 & 14.52 & 28952.1 & 78.67 \\
96 & 2703.08 & 7.35 & 5620.08 & 15.27 & 32816.4 & 89.18 \\
97 & 2729.68 & 7.42 & 4754.1 & 12.92 & 31699.2 & 86.14 \\
127 & 2556.58 & 6.95 & 4977.82 & 13.53 & 30274.5 & 82.27 \\
128 & 1803.41 & 4.90 & 4817.8 & 13.09 & 32721.7 & 88.92 \\
129 & 2669.26 & 7.25 & 4594.25 & 12.48 & 31746.4 & 86.27 \\
191 & 2290.09 & 6.22 & 4931.27 & 13.40 & 32263.1 & 87.67 \\
192 & 1801.66 & 4.90 & 5549.67 & 15.08 & 35491.2 & 96.44 \\
229 & 2218.61 & 6.03 & 4982.59 & 13.54 & 34557.2 & 93.91 \\
255 & 2178.15 & 5.92 & 4528.43 & 12.31 & 33771.3 & 91.77 \\
256 & 808.413 & 2.20 & 4652.68 & 12.64 & 35221.1 & 95.71 \\
257 & 2238.93 & 6.08 & 4512.33 & 12.26 & 33807.9 & 91.87 \\
319 & 2174.45 & 5.91 & 5093.38 & 13.84 & 34415.8 & 93.52 \\
320 & 1612.13 & 4.38 & 5674.61 & 15.42 & 36500.2 & 99.19 \\
321 & 2173.64 & 5.91 & 5111.09 & 13.89 & 35508.1 & 96.49 \\
417 & 2125.36 & 5.78 & 5143.98 & 13.98 & 36157.6 & 98.25 \\
479 & 2107.13 & 5.73 & 5152.51 & 14.00 & 36186.4 & 98.33 \\
480 & 1848.43 & 5.02 & 5703 & 15.50 & 37971.3 & 103.18 \\
511 & 2112.99 & 5.74 & 4479.96 & 12.17 & 35144 & 95.50 \\
512 & 801.127 & 2.18 & 4596.26 & 12.49 & 37362.5 & 101.53 \\
639 & 1881.94 & 5.11 & 5168.59 & 14.05 & 36989.1 & 100.51 \\
640 & 815.847 & 2.22 & 5232.97 & 14.22 & 38267.8 & 103.99 \\
767 & 1825.75 & 4.96 & 4701.09 & 12.77 & 37220.8 & 101.14 \\
768 & 812.933 & 2.21 & 4826.12 & 13.11 & 38744 & 105.28 \\
769 & 1825.38 & 4.96 & 4686.21 & 12.73 & 37076.1 & 100.75 \\
\bottomrule
\end{tabular}
\end{center}
\caption{MFlop/s and CPU utilization per matrix size of the matrix
multiplication benchmark for the naive, blocked, and BLAS implementations.}
\label{fig:benchtab}
\end{figure}

\end{document}