\documentclass[unicode,11pt,a4paper,oneside,numbers=endperiod,openany]{scrartcl}

\input{assignment.sty}
\usepackage{float}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{tikz}
\usetikzlibrary{decorations.markings}
\usepackage{url}
\hypersetup{pdfborder = {0 0 0}}
\usepackage{xcolor}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{booktabs}
\usepackage{algorithm2e}
\usepackage[nomessages]{fp}

\begin{document}

\setassignment
\setduedate{12.10.2022 (midnight)}

\serieheader{High-Performance Computing Lab}{2022}{Student: Claudio Maggioni}{
    Discussed with: M. Cattaneo, G. De Vita, V. Karpenko}{Solution for Project 1}{}
\newline

%\assignmentpolicy
%In this project you will practice memory access optimization,
%performance-oriented programming, and OpenMP parallelizaton on the ICS Cluster.  

\tableofcontents

\section{Explaining Memory Hierarchies \punkte{25}}

\subsection{Memory Hierarchy Parameters of the Cluster}
By invoking \texttt{likwid-topology} for the cache topology and \texttt{free -g}
for the amount of primary memory, the following memory hierarchy parameters are
found:

\begin{center}
\begin{tabular}{llll}
Main memory & 62 GB              \\
L3 cache    & 25 MB per socket   \\
L2 cache    & 256 kB per core    \\
L1 cache    & 32 kB per core    
\end{tabular}
\end{center}

All values are reported using base 2 IEC byte units. The cluster has 2 sockets
and a total of 20 cores (10 per socket). The cache topology diagram reported by
\texttt{likwid-topology -g} is shown in Figure \ref{fig:topo}.

\pagebreak[4]

\begin{figure}[t]
    \begin{center}
       Socket 0:\vspace{0.3cm}
        \begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
            \hline 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\\hline
            32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32
            kB & 32 kB \\\hline
            256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256
            kB & 256 kB & 256 kB \\\hline
            \multicolumn{10}{|c|}{25 MB} \\\hline
      \end{tabular}\vspace{0.8cm}\\
       Socket 1:\vspace{0.3cm}
       \begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
            \hline 10 & 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 \\\hline
            32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32
            kB & 32 kB \\\hline
            256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256
            kB & 256 kB & 256 kB \\\hline
            \multicolumn{10}{|c|}{25 MB} \\\hline
       \end{tabular}
    \end{center}
    \caption{Cache topology diagram as outputted by \texttt{likwid-topology -g}.
    Byte sizes all in IEC units.}
    \label{fig:topo}
\end{figure}

\subsection{Memory Access Pattern of \texttt{membench.c}}

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
    \tikzset{->-/.style={decoration={
  markings,
  mark=at position .75 with {\arrow{>}}},postaction={decorate}}};

    \draw (0,0) grid (5,1);
    \draw [dashed] (5,0) -- (5.5,0);
    \draw [dashed] (5,1) -- (5.5,1);
    \draw [dashed] (6.5,0) -- (7,0);
    \draw [dashed] (6.5,1) -- (7,1);
    \draw (7,0) grid (12,1);

    \foreach \r in {0,1,...,4}{
        \fill (\r + 0.5,0.5) circle [radius=2pt];
        \draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);
        \draw (\r + 0.5, -0.5) node {$\r$};
    }
    \draw[->-] (4.5,0.5) to[bend left] (5.5,0.5);
    \foreach \r in {7,8,...,11}{
        \fill (\r + 0.5,0.5) circle [radius=2pt];
        \FPeval{l}{round(\r + 128 - 12, 0)}
        \draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);
        \draw (\r + 0.5, -0.5) node {$\l$};
    }
    
    \draw (0,-3) grid (3,-2);
    \draw [dashed] (3,-2) -- (3.5,-2);
    \draw [dashed] (3,-3) -- (3.5,-3);
    \draw [dashed] (4,-2) -- (4.5,-2);
    \draw [dashed] (4,-3) -- (4.5,-3);
    \draw (4.5,-2) -- (7.5,-2);
    \draw (4.5,-3) -- (7.5,-3);
    \foreach \r in {4.5,5.5,...,7.5}{
        \draw (\r,-3) -- (\r,-2);
    }
    \draw [dashed] (7.5,-2) -- (8,-2);
    \draw [dashed] (7.5,-3) -- (8,-3);
    \draw [dashed] (8.5,-2) -- (9,-2);
    \draw [dashed] (8.5,-3) -- (9,-3);
    \draw (9,-3) grid (12,-2);

    \fill (0.5,-2.5) circle [radius=2pt];
    \fill (6,-2.5) circle [radius=2pt];
    \foreach \r in {0,1,2}{
        \draw (\r + 0.5, -3.5) node {$\r$};
    }
    \foreach \r in {9,10,11}{
        \FPeval{l}{round(\r - 12, 0)}
        \draw (\r + 0.5, -3.5) node {\tiny $2^{20}  \l$};
    }
    \draw (5, -3.5) node {\tiny $2^{19} - 1$};
    \draw (6, -3.5) node {\tiny $2^{19}$};
    \draw (7,-3.5) node {\tiny $2^{19} + 1$};
    \draw[->-] (-0.5,-2.5) to[bend left] (0.5,-2.5);
    \draw[->-] (0.5,-2.5) to[bend left] (6,-2.5);
\end{tikzpicture}
\end{center}
    \caption{Memory access patterns of \texttt{membench.c} for \texttt{csize =
    128} and \texttt{stride = 1} (above) and for \texttt{csize = $2^{20}$} and
    \texttt{stride = $2^{19}$} (below)}
    \label{fig:access}
\end{figure}

The benchmark \texttt{membench.c} measures the average time of repeated read and
write overations across a set of indices of a stack-allocated array of 32-bit
signed integers. The indices vary according to the access pattern used, which in
turn is defined by two variables, \texttt{csize} and \texttt{stride}.
\texttt{csize} is an upper bound on the index value, i.e.  (one more of) the
highest index used to access the array in the pattern.  \texttt{stride}
determines the difference between array indexes over access iterations, i.e. a
\texttt{stride} of 1 will access every array index, a \texttt{stride} of 2 will
skip every other index, a \texttt{stride} of 4 will access one index then skip 3
and so on and so forth. The benchmark stops when the index to access is strictly
greater than \texttt{csize - stride}.

Therefore, for \texttt{csize = 128} and \texttt{stride = 1} the array will
access all indexes between 0 and 127 sequentially, and for \texttt{csize =
$2^{20}$} and \texttt{stride = $2^{19}$} the benchmark will access index 0, then
index $2^{19}-1$. The access patterns for these
two configurations are shown visually in Figure \ref{fig:access}.

By running the \texttt{membench.c} both on my personal laptop and on the
cluster, the results shown in Figure \ref{fig:mem} are obtained. \textit{csize}
values are shown as different data series and labeled by byte size and
\textit{stride} values are mapped on the $x$ axis by the byte-equivalent value
as well\footnote{Byte values are a factor of 4 greater than the values used in
the code and in Figure \ref{fig:mem}. This is due to the fact that the array
elements used in the benchmark are 32-bit signed integers, which take up 4 bytes
each.}. For $\texttt{csize = 128} = 512$ bytes and $\texttt{stride = 1} = 4$
bytes the mean access time is $0.124$ nanoseconds, while for $\texttt{csize =
$2^{20}$} = 4$MB and for $\texttt{stride = $2^{19}$} = 2$MB the mean access time
is $1.156$ nanoseconds. The first set of parameters performs well thanks to the
low \textit{stride} value, thus achieving very good space locality and
maximizing cache hits. However, the second set of parameters achieves good
performance as well thanks to the few values accessed with each pass, thus
improving the temporal locality of each address accessed. This observation
applies for the few last data points in each data series of Figure
\ref{fig:mem}, i.e. for \textit{stride} values close to \textit{csize}. 

\subsection{Analyzing Benchmark Results}

\begin{figure}[t]
    \begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{generic_macos.pdf}
        \caption{Personal laptop}
        \label{fig:mem:laptop}
    \end{subfigure}
    \begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{generic_cluster.pdf}
        \caption{Cluster}
        \label{fig:mem:cluster}
    \end{subfigure}
    \caption{Results of the \texttt{membench.c} benchmark for both my personal
    laptop (in Figure \ref{fig:mem:laptop}) and the cluster (in Figure
    \ref{fig:mem:cluster}).}
    \label{fig:mem}
\end{figure}

The \texttt{membench.c} benchmark results for my personal laptop (Macbook Pro
2018 with a Core i7-8750H CPU) and the cluster are shown in figure
\ref{fig:mem}.

The memory access graph for the cluster's benchmark results shows that temporal
locality is best for small array sizes and for small \texttt{stride} values.
In particular, for array memory sizes of 16MB or lower (\texttt{csize} of $4
\cdot 2^{20}$ or lower) and \texttt{stride} values of 2048 or lower the mean
read+write time is less than 10 nanoseconds. Temporal locality is worst for
large sizes and strides, although the largest values of \texttt{stride} for each
size (like \texttt{csize / 2} and \texttt{csize / 4}) achieve better mean times
for the aformentioned effect of having \textit{stride} values close to
\textit{csize}.

The pattern that can be read from the graphs, especially the one for the
cluster, shows that the \textit{stride} axis is divided in regions showing
memory access time of similar magnitude. The boundary between the first and the
second region is a \textit{stride} value of rougly 2KB, while a \textit{stride}
of 512KB roughly separates the second and the third region. The difference in
performance between regions and the similarity of performance within regions
suggest the threshold stride values are related to changes in the use of the
cache hierarchy. In particular, the first region may characterize regions where
the L1 cache, the fastest non-register memory available, is predominantly used.
Then the second region might overlap with a more intense use of the L2 cache and
likewise between the third region and the L3 cache.
\marginpar[right text]{\color{white}\url{https://youtu.be/JzJlzGaQFoc}}

\section{Optimize Square Matrix-Matrix Multiplication  \punkte{60}}


\begin{figure}[t]
\begin{verbatim}
INPUT: A (n by n), B (n by n), n
OUTPUT: C (n by n)

s := 26 # block dimension

A_row := <matrix A converted in row major form>
C_temp := <empty s by s matrix>

for i := 0 to n by s:
    i_next := min(i + s, n)

    for j := 0 to n by s:
        j_next := min(j + s, n)

        <set all cells in C_temp to 0>

        for k := 0 to n by s:
            k_next := min(k + s, n)

            # Perform naive matrix multiplication, incrementing cells of C_temp
            # with each multiplication result
            naivemm(A_row[i, k][i_next, k_next], B[k, j][k_next, j_next],
                    C_temp[0, 0][i_next - i, j_next - j])
        end for

        C[i, j][i_next, j_next] = C_temp[0, 0][i_next - i, j_next - j]
    end for
end for
\end{verbatim}
    \caption{Pseudocode listing of my blocked matrix multiplication
    implementation. Matrix indices start from 0 (i.e. row $0$ and column $0$
    denotes the top-left-most cell in a matrix). \\ \texttt{M[a, b][c, d]} denotes
    a rectangular region of the matrix $M$ whose top-left-most cell is the cell
    in $M$ at row $a$ and column $b$ and whose bottom-right-most cell is the 
    cell in $M$ at row $c - 1$ and column $d - 1$.}
    \label{fig:algo}
\end{figure}

The file \texttt{matmult/dgemm-blocked.c} contains a C implementation of the
blocked matrix multiplication algorithm presented in the project. A pseudocode
listing of the implementation is provided in Figure \ref{fig:algo}.

In order to achieve a correct and fast execution, my implementation:

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
    \fill[blue!60!white] (4,0) rectangle (5,-2);
    \fill[blue!40!white] (4,-2) rectangle (5,-4);
    \fill[blue!60!white] (0,-4) rectangle (2,-5);
    \fill[blue!40!white] (2,-4) rectangle (4,-5);
    \fill[green!40!white] (4,-4) rectangle (5,-5);
    \draw[step=1,gray,very thin] (0,0) grid (5,-5);
    \draw[step=2] (0,0) grid (5,-5);
    \draw[step=5] (0,0) grid (5,-5);
\end{tikzpicture}
\end{center}
    \caption{Result of the block division process of a square matrix of size 5
    using a block size of 2. The 2-by-1 and 1-by-2 rectangular remainders are
    shown in blue and the square matrix of remainder size (i.e. 1) is shown in
    green.}
    \label{fig:matrix}
\end{figure}

\begin{itemize}
    \item Handles the edge cases related to the ``remainders'' in the matrix
        block division, i.e. when the division between the size of the matrix
        and the block size yields a remainder. Assuming only squared matrices
        are multiplied through the algorithm (as in the test suite provided) the
        block division could yield rectangular matrix blocks located in the last
        rows and columns of each matrix, and the bottom-right corner of the
        matrix will be contained in a square matrix block of the size of the
        remainder. The result of this process is shown in Figure
        \ref{fig:matrix};
    \item Converts matrix A into row major format. As shown in Figure
        \ref{fig:iter}, by having A in row major format and B in column major
        format, iterations across matrix block in the inner most loop of the
        algorithm (the one calling \textit{naivemm}) cache hits are maximised by
        achieving space locality between the blocks used. This achieved
        approximately an increase of performance of two percentage points in
        terms of CPU utilization (i.e. from a baseline of $4\%$ to $6\%$),
    \item Caches the result of each innermost iteration into a temporary matrix
        of block size before storing it into matrix C. This achieves better
        space locality when \textit{naivemm} needs to store values in matrix C.
        The block size temporary matrix has virtually no stride and thus cache
        hits are maximised. The copy operation is implemented with bulk copy
        \texttt{memcpy} calls. This optimization achieves an extra half of a
        percentage point in terms of CPU utilization (i.e. from the $6\%$
        discussed above to roughly $6.5\%$).
    \item Exploits some compiler optimizations, namely using the compiler
        optimizer at the \texttt{-O3} setting and using the \texttt{-ffast-math}
        and \texttt{-march=haswell} to respectively apply some floating point
        arithmetic optimizations and to set the compiler target to the exact ISA
        of the cluster's processor. Note that these flags are applied to all
        implementations used in the benchmark as the flags were added in the 
        \texttt{Makefile}. In addition, the \texttt{naivemm} algoritms was
        inlined in the actual \texttt{dgemm-blocked.c} source code instead of
        being separated into a function to be called to achieve better
        performance and compiler optimizations. All these changes increased the
        CPU utilization from $6.5\%$ to an average of $13.45\%$.
\end{itemize}

The chosen matrix block size for running the benchmark on the cluster is:

$$s = 32$$

as shown in the pseudocode. This value has been obtained by running an empirical
binary search on the value using the benchmark as a metric, i.e. by running
\texttt{./run\_matrixmult.sh} several times with different values. For square
blocks (i.e. the worst case) the total size for the matrix $A$ and $B$ sub-block
and the \texttt{C\_temp} temporary matrix block for $C$ is:

$$\mathrm{Bytes} = \mathrm{cellSize} * s^2 * 3 = 8 * 32^2 * 3 = 24576$$

given that a double-precision floating point number, the data type used for
matrix cells in the scope of this project, is 8 bytes long. The obtained total
bytes size is fairly close to the L1 cache size of the processor used in the
cluster ($32\mathrm{Kb} = 32768$ bytes), which is expected given that the
algorithm needs to exploit fast memory as much as possible. The reason the
empirically best value results in a theoretical cache allocation that is only
half of the complete L1 cache size is due to some real-life factors. For example,
cache misses tipically result in aligned page loads which may load unnecessary
data.

A potential way to exploit the different cache levels is to apply the blocked
matrix algorithm iteratively multiple times. For example, OpenBLAS implements
DGEMM by having two levels of matrix blocks to better exploit the L2 and L3
caches found on most processors.

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
    \node[align=center] at (2.5,0.5) {Matrix A};
    \fill[orange!10!white] (0,0) rectangle (2,-2);
    \fill[orange!25!white] (2,0) rectangle (4,-2);
    \fill[orange!40!white] (4,0) rectangle (5,-2);
    
    \draw[step=1,gray,very thin] (0,0) grid (5,-5);
    \draw[step=2,black,thick] (0,0) grid (5,-5);
    \draw[step=5,black,thick] (0,0) grid (5,-5);
    
    \draw[-to,step=1,red,very thick] (0.5,-0.5) -- (4.5,-0.5); 
    \draw[-to,step=1,red,very thick] (0.5,-1.5) -- (4.5,-1.5); 
    \draw[-to,step=1,red,very thick] (0.5,-2.5) -- (4.5,-2.5); 
    \draw[-to,step=1,red,very thick] (0.5,-3.5) -- (4.5,-3.5); 
    \draw[-to,step=1,red,very thick] (0.5,-4.5) -- (4.5,-4.5); 
    
    \node[align=center] at (8.5,0.5) {Matrix B};
    \fill[orange!10!white] (6,0) rectangle (8,-2);
    \fill[orange!25!white] (6,-2) rectangle (8,-4);
    \fill[orange!40!white] (6,-4) rectangle (8,-5);
    
    \draw[step=1,gray,very thin] (6,0) grid (11,-5);
    \draw[step=2,black,thick] (6,0) grid (11,-5);
    \draw[step=5,black,thick] (6,0) grid (11,-5);
    \draw[black,thick] (11,0) -- (11,-5);

    \draw[-to,step=1,red,very thick] (6.5,-0.5) -- (6.5,-4.5); 
    \draw[-to,step=1,red,very thick] (7.5,-0.5) -- (7.5,-4.5); 
    \draw[-to,step=1,red,very thick] (8.5,-0.5) -- (8.5,-4.5); 
    \draw[-to,step=1,red,very thick] (9.5,-0.5) -- (9.5,-4.5); 
    \draw[-to,step=1,red,very thick] (10.5,-0.5) -- (10.5,-4.5); 
\end{tikzpicture}
\end{center}
    \caption{Inner most loop iteration of the blocked GEMM algorithm across
    matrices A and B. The red lines represent the ``majorness'' of each matrix
    (A is converted by the algorithm in row-major form, while B is given and
    used in column-major form). The shades of orange represent the blocks used
    in each iteration.}
    \label{fig:iter}
\end{figure}

The results of the matrix multiplication benchmark for the naive, blocked, and
BLAS implementations are shown in Figure \ref{fig:bench} as a graph of GFlop/s
over matrix size or in Figure \ref{fig:benchtab} as a table. The blocked
implementation achieves up to 200\% more FLOPS than the naive implementation for
the largest matrix dimensions. However, the blocked implementation achives
roughly an eighth of the FLOPS the Intel MKL BLAS based implementation achieves.

I was unable to run this benchmark suite on my personal machine due to Intel MKL
installation issues that prevented the code to compile.

\begin{figure}[t]
    \includegraphics[width=\textwidth]{timing.pdf}
    \caption{GFlop/s per matrix size of the matrix multiplication benchmark for the naive,
    blocked, and BLAS implementations. The Y-axis is log-scaled.}
    \label{fig:bench}
\end{figure}

\begin{figure}[t]
\begin{center}
\begin{tabular}{c|cc|cc|cc}
    \toprule
    & \multicolumn{2}{c|}{Naive} & \multicolumn{2}{c|}{Blocked} &
    \multicolumn{2}{c}{BLAS} \\
    \makecell{Size} & \makecell{MFLOPS} &
    \makecell{\% CPU} & \makecell{MFLOPS} &
    \makecell{\% CPU} & \makecell{MFLOPS} &
    \makecell{\% CPU} \\
    \midrule
31	 & 3140.45	 & 8.53 & 3844.56	 & 10.45 & 25677.4	 & 69.78 \\
32	 & 3364.78	 & 9.14 & 5342.55	 & 14.52 & 28952.1	 & 78.67 \\
96	 & 2703.08	 & 7.35 & 5620.08	 & 15.27 & 32816.4	 & 89.18 \\
97	 & 2729.68	 & 7.42 & 4754.1	 & 12.92 & 31699.2	 & 86.14 \\
127	 & 2556.58	 & 6.95 & 4977.82	 & 13.53 & 30274.5	 & 82.27 \\
128	 & 1803.41	 & 4.90 & 4817.8	 & 13.09 & 32721.7	 & 88.92 \\
129	 & 2669.26	 & 7.25 & 4594.25	 & 12.48 & 31746.4	 & 86.27 \\
191	 & 2290.09	 & 6.22 & 4931.27	 & 13.40 & 32263.1	 & 87.67 \\
192	 & 1801.66	 & 4.90 & 5549.67	 & 15.08 & 35491.2	 & 96.44 \\
229	 & 2218.61	 & 6.03 & 4982.59	 & 13.54 & 34557.2	 & 93.91 \\
255	 & 2178.15	 & 5.92 & 4528.43	 & 12.31 & 33771.3	 & 91.77 \\
256	 & 808.413	 & 2.20 & 4652.68	 & 12.64 & 35221.1	 & 95.71 \\
257	 & 2238.93	 & 6.08 & 4512.33	 & 12.26 & 33807.9	 & 91.87 \\
319	 & 2174.45	 & 5.91 & 5093.38	 & 13.84 & 34415.8	 & 93.52 \\
320	 & 1612.13	 & 4.38 & 5674.61	 & 15.42 & 36500.2	 & 99.19 \\
321	 & 2173.64	 & 5.91 & 5111.09	 & 13.89 & 35508.1	 & 96.49 \\
417	 & 2125.36	 & 5.78 & 5143.98	 & 13.98 & 36157.6	 & 98.25 \\
479	 & 2107.13	 & 5.73 & 5152.51	 & 14.00 & 36186.4	 & 98.33 \\
480	 & 1848.43	 & 5.02 & 5703	 & 15.50 & 37971.3	 & 103.18 \\
511	 & 2112.99	 & 5.74 & 4479.96	 & 12.17 & 35144	 & 95.50 \\
512	 & 801.127	 & 2.18 & 4596.26	 & 12.49 & 37362.5	 & 101.53 \\
639	 & 1881.94	 & 5.11 & 5168.59	 & 14.05 & 36989.1	 & 100.51 \\
640	 & 815.847	 & 2.22 & 5232.97	 & 14.22 & 38267.8	 & 103.99 \\
767	 & 1825.75	 & 4.96 & 4701.09	 & 12.77 & 37220.8	 & 101.14 \\
768	 & 812.933	 & 2.21 & 4826.12	 & 13.11 & 38744	 & 105.28 \\
769	 & 1825.38	 & 4.96 & 4686.21	 & 12.73 & 37076.1	 & 100.75 \\
    \bottomrule
\end{tabular}
\end{center}
    \caption{MFlop/s and CPU utlisation per matrix size of the matrix
    multiplication benchmark for the naive, blocked, and BLAS implementations.}
    \label{fig:benchtab}
\end{figure}
\end{document}