HPC/Project1/project_1_maggioni_claudio.tex

\documentclass[unicode,11pt,a4paper,oneside,numbers=endperiod,openany]{scrartcl}

\input{assignment.sty}
\usepackage{float}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{tikz}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{booktabs}
\usepackage[nomessages]{fp}
\usetikzlibrary{decorations.markings}

\begin{document}

\setassignment
\setduedate{12.10.2022 (midnight)}

\serieheader{High-Performance Computing Lab}{2022}{Student: Claudio
Maggioni}{Discussed with: --}{Solution for Project 1}{}
\newline

%\assignmentpolicy
%In this project you will practice memory access optimization,
%performance-oriented programming, and OpenMP parallelization on the ICS Cluster.

\tableofcontents

\section{Explaining Memory Hierarchies \punkte{25}}

\subsection{Memory Hierarchy Parameters of the Cluster}

By invoking \texttt{likwid-topology} for the cache topology and \texttt{free -g}
for the amount of primary memory, the following memory hierarchy parameters are
found:

\begin{center}
\begin{tabular}{ll}
Main memory & 62 GB              \\
L3 cache    & 25 MB per socket   \\
L2 cache    & 256 kB per core    \\
L1 cache    & 32 kB per core    
\end{tabular}
\end{center}

All values are reported using base 2 IEC byte units. The cluster has 2 sockets
and a total of 20 cores (10 per socket). The cache topology diagram reported by
\texttt{likwid-topology -g} is shown in Figure \ref{fig:topo}.

\pagebreak[4]

\begin{figure}[t]
    \begin{center}
       Socket 0:\vspace{0.3cm}
        \begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
            \hline 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\\hline
            32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32
            kB & 32 kB \\\hline
            256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256
            kB & 256 kB & 256 kB \\\hline
            \multicolumn{10}{|c|}{25 MB} \\\hline
      \end{tabular}\vspace{0.8cm}\\
       Socket 1:\vspace{0.3cm}
       \begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
            \hline 10 & 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 \\\hline
            32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32
            kB & 32 kB \\\hline
            256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256
            kB & 256 kB & 256 kB \\\hline
            \multicolumn{10}{|c|}{25 MB} \\\hline
       \end{tabular}
    \end{center}
    \caption{Cache topology diagram as outputted by \texttt{likwid-topology -g}.
    Byte sizes all in IEC units.}
    \label{fig:topo}
\end{figure}

\subsection{Memory Access Pattern of \texttt{membench.c}}

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
    \tikzset{->-/.style={decoration={
  markings,
  mark=at position .75 with {\arrow{>}}},postaction={decorate}}};

    \draw (0,0) grid (5,1);
    \draw [dashed] (5,0) -- (5.5,0);
    \draw [dashed] (5,1) -- (5.5,1);
    \draw [dashed] (6.5,0) -- (7,0);
    \draw [dashed] (6.5,1) -- (7,1);
    \draw (7,0) grid (12,1);

    \foreach \r in {0,1,...,4}{
        \fill (\r + 0.5,0.5) circle [radius=2pt];
        \draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);
        \draw (\r + 0.5, -0.5) node {$\r$};
    }
    \draw[->-] (4.5,0.5) to[bend left] (5.5,0.5);
    \foreach \r in {7,8,...,11}{
        \fill (\r + 0.5,0.5) circle [radius=2pt];
        \FPeval{l}{round(\r + 128 - 12, 0)}
        \draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);
        \draw (\r + 0.5, -0.5) node {$\l$};
    }
    
    \draw (0,-3) grid (3,-2);
    \draw [dashed] (3,-2) -- (3.5,-2);
    \draw [dashed] (3,-3) -- (3.5,-3);
    \draw [dashed] (4,-2) -- (4.5,-2);
    \draw [dashed] (4,-3) -- (4.5,-3);
    \draw (4.5,-2) -- (7.5,-2);
    \draw (4.5,-3) -- (7.5,-3);
    \foreach \r in {4.5,5.5,...,7.5}{
        \draw (\r,-3) -- (\r,-2);
    }
    \draw [dashed] (7.5,-2) -- (8,-2);
    \draw [dashed] (7.5,-3) -- (8,-3);
    \draw [dashed] (8.5,-2) -- (9,-2);
    \draw [dashed] (8.5,-3) -- (9,-3);
    \draw (9,-3) grid (12,-2);

    \fill (0.5,-2.5) circle [radius=2pt];
    \fill (6,-2.5) circle [radius=2pt];
    \fill (11.5,-2.5) circle [radius=2pt];
    \foreach \r in {0,1,2}{
        \draw (\r + 0.5, -3.5) node {$\r$};
    }
    \foreach \r in {9,10,11}{
        \FPeval{l}{round(\r - 12, 0)}
        \draw (\r + 0.5, -3.5) node {\tiny $2^{20}  \l$};
    }
    \foreach \r in {4.5,5.5}{
        \FPeval{l}{round(\r - 6.5, 0)}
        \draw (\r + 0.5, -3.5) node {\tiny $2^{10}  \l$};
    }
    \draw (7,-3.5) node {\tiny $2^{10}$};
    \draw[->-] (-0.5,-2.5) to[bend left] (0.5,-2.5);
    \draw[->-] (0.5,-2.5) to[bend left] (6,-2.5);
    \draw[->-] (6,-2.5) to[bend left] (11.5,-2.5);
\end{tikzpicture}
\end{center}
    \caption{Memory access patterns of \texttt{membench.c} for \texttt{csize =
    128} and \texttt{stride = 1} (above) and for \texttt{csize = $2^{20}$} and
    \texttt{stride = $2^{10}$} (below)}
    \label{fig:access}
\end{figure}

The benchmark \texttt{membench.c} measures the average time of repeated read and
write operations across a set of indices of a stack-allocated array of 32-bit
signed integers. The indices vary according to the access pattern used, which in
turn is defined by two variables, \texttt{csize} and \texttt{stride}.
\texttt{csize} is an upper bound on the index value, i.e.\ (one more than) the
highest index used to access the array in the pattern. \texttt{stride}
determines the difference between array indexes over access iterations, i.e.\ a
\texttt{stride} of 1 will access every array index, a \texttt{stride} of 2 will
skip every other index, a \texttt{stride} of 4 will access one index then skip 3,
and so on.

Therefore, for \texttt{csize = 128} and \texttt{stride = 1} the benchmark will
access all indexes between 0 and 127 sequentially, and for \texttt{csize =
$2^{20}$} and \texttt{stride = $2^{10}$} the benchmark will access index 0, then
index $2^{10}-1$, and finally index $2^{20}-1$. The access patterns for these
two configurations are shown visually in Figure~\ref{fig:access}.

\subsection{Analyzing Benchmark Results}

\begin{figure}[t]
    \begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{generic_macos.pdf}
        \caption{Personal laptop}
        \label{fig:mem:laptop}
    \end{subfigure}
    \begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{generic_cluster.pdf}
        \caption{Cluster}
        \label{fig:mem:cluster}
    \end{subfigure}
    \caption{Results of the \texttt{membench.c} benchmark for both my personal
    laptop (in Figure \ref{fig:mem:laptop}) and the cluster (in Figure
    \ref{fig:mem:cluster}).}
    \label{fig:mem}
\end{figure}

The \texttt{membench.c} benchmark results for my personal laptop (MacBook Pro
2018 with a Core i7-8750H CPU) and the cluster are shown in
Figure~\ref{fig:mem}.

The memory access graph for the cluster's benchmark results shows that temporal
locality is best for small array sizes and for small \texttt{stride} values.
In particular, for array memory sizes of 16 MB or lower (\texttt{csize} of $4
\cdot 2^{20}$ or lower) and \texttt{stride} values of 2048 or lower the mean
read+write time is less than 10 nanoseconds. Temporal locality is worst for
large sizes and strides, although the largest values of \texttt{stride} for each
size (like \texttt{csize / 2} and \texttt{csize / 4}) achieve better mean times
due to the few elements accessed in the pattern (this observation is also valid
for the largest strides of each size series shown in the graph).

\section{Optimize Square Matrix-Matrix Multiplication  \punkte{60}}

The file \texttt{matmult/dgemm-blocked.c} contains a C implementation of the
blocked matrix multiplication algorithm presented in the project. Other than
implementing the pseudocode, my implementation:

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
    \fill[blue!60!white] (4,0) rectangle (5,-2);
    \fill[blue!40!white] (4,-2) rectangle (5,-4);
    \fill[blue!60!white] (0,-4) rectangle (2,-5);
    \fill[blue!40!white] (2,-4) rectangle (4,-5);
    \fill[green!40!white] (4,-4) rectangle (5,-5);
    \draw[step=1,gray,very thin] (0,0) grid (5,-5);
    \draw[step=2] (0,0) grid (5,-5);
    \draw[step=5] (0,0) grid (5,-5);
\end{tikzpicture}
\end{center}
    \caption{Result of the block division process of a square matrix of size 5
    using a block size of 2. The 2-by-1 and 1-by-2 rectangular remainders are
    shown in blue and the square matrix of remainder size (i.e. 1) is shown in
    green.}
    \label{fig:matrix}
\end{figure}

\begin{itemize}
    \item Handles the edge cases related to the ``remainders'' in the matrix
        block division, i.e. when the division between the size of the matrix
        and the block size yields a remainder. Assuming only square matrices
        are multiplied through the algorithm (as in the test suite provided) the
        block division could yield rectangular matrix blocks located in the last
        rows and columns of each matrix, and the bottom-right corner of the
        matrix will be contained in a square matrix block of the size of the
        remainder. The result of this process is shown in Figure
        \ref{fig:matrix};
    \item Converts matrix A into row-major format. As shown in
        Figure~\ref{fig:iter}, by having A in row-major format and B in
        column-major format, cache hits are maximised while iterating across
        matrix blocks in the innermost loop of the algorithm (the one calling
        \textit{naivemm}), since spatial locality is achieved between the
        blocks used;
    \item Caches the result of each innermost iteration into a temporary matrix
        of block size before storing it into matrix C. This achieves better
        spatial locality when \textit{naivemm} needs to store values in matrix C.
        The block size temporary matrix has virtually no stride and thus cache
        hits are maximised. The copy operation is implemented with bulk copy
        \texttt{memcpy} calls.
\end{itemize}

\begin{figure}[t]
\begin{center}
\begin{tikzpicture}
    \node[align=center] at (2.5,0.5) {Matrix A};
    \fill[orange!10!white] (0,0) rectangle (2,-2);
    \fill[orange!25!white] (2,0) rectangle (4,-2);
    \fill[orange!40!white] (4,0) rectangle (5,-2);
    
    \draw[step=1,gray,very thin] (0,0) grid (5,-5);
    \draw[step=2,black,thick] (0,0) grid (5,-5);
    \draw[step=5,black,thick] (0,0) grid (5,-5);
    
    \draw[-to,step=1,red,very thick] (0.5,-0.5) -- (4.5,-0.5); 
    \draw[-to,step=1,red,very thick] (0.5,-1.5) -- (4.5,-1.5); 
    \draw[-to,step=1,red,very thick] (0.5,-2.5) -- (4.5,-2.5); 
    \draw[-to,step=1,red,very thick] (0.5,-3.5) -- (4.5,-3.5); 
    \draw[-to,step=1,red,very thick] (0.5,-4.5) -- (4.5,-4.5); 
    
    \node[align=center] at (8.5,0.5) {Matrix B};
    \fill[orange!10!white] (6,0) rectangle (8,-2);
    \fill[orange!25!white] (6,-2) rectangle (8,-4);
    \fill[orange!40!white] (6,-4) rectangle (8,-5);
    
    \draw[step=1,gray,very thin] (6,0) grid (11,-5);
    \draw[step=2,black,thick] (6,0) grid (11,-5);
    \draw[step=5,black,thick] (6,0) grid (11,-5);
    \draw[black,thick] (11,0) -- (11,-5);

    \draw[-to,step=1,red,very thick] (6.5,-0.5) -- (6.5,-4.5); 
    \draw[-to,step=1,red,very thick] (7.5,-0.5) -- (7.5,-4.5); 
    \draw[-to,step=1,red,very thick] (8.5,-0.5) -- (8.5,-4.5); 
    \draw[-to,step=1,red,very thick] (9.5,-0.5) -- (9.5,-4.5); 
    \draw[-to,step=1,red,very thick] (10.5,-0.5) -- (10.5,-4.5); 
\end{tikzpicture}
\end{center}
    \caption{Innermost loop iteration of the blocked GEMM algorithm across
    matrices A and B. The red lines represent the ``majorness'' of each matrix
    (A is converted by the algorithm into row-major form, while B is given and
    used in column-major form). The shades of orange represent the blocks used
    in each iteration.}
    \label{fig:iter}
\end{figure}

The results of the matrix multiplication benchmark for the naive, blocked, and
BLAS implementations are shown in Figure~\ref{fig:bench} as a graph of GFlop/s
over matrix size and in Figure~\ref{fig:benchtab} as a table. The blocked
implementation achieves on average 50\% more FLOPS than the naive
implementation thanks to the optimisations in spatial and temporal cache
locality described above. However, the blocked implementation achieves less
than a tenth of the FLOPS of the Intel MKL BLAS-based one, due to the
microarchitecture-specific optimizations the latter is able to exploit.

\begin{figure}[t]
    \includegraphics[width=\textwidth]{timing.pdf}
    \caption{GFlop/s per matrix size of the matrix multiplication benchmark for the naive,
    blocked, and BLAS implementations. The Y-axis is log-scaled.}
    \label{fig:bench}
\end{figure}

\begin{figure}[t]
\begin{center}
\begin{tabular}{c|cc|cc|cc}
    \toprule
    & \multicolumn{2}{c|}{Naive} & \multicolumn{2}{c|}{Blocked} &
    \multicolumn{2}{c}{BLAS} \\
    \makecell{Size} & \makecell{MFLOPS} &
    \makecell{\% CPU} & \makecell{MFLOPS} &
    \makecell{\% CPU} & \makecell{MFLOPS} &
    \makecell{\% CPU} \\
    \midrule
        31 & 2393.33 & 6.50 & 2112.63 & 5.74 & 23449.20 & 63.72 \\
        32 & 2400.13 & 6.52 & 2187.44 & 5.94 & 28198.90 & 76.63 \\
        96 & 1998.74 & 5.43 & 2325.39 & 6.32 & 32542.30 & 88.43 \\
        97 & 1996.01 & 5.42 & 2322.81 & 6.31 & 29801.30 & 80.98 \\
        127 & 1923.81 & 5.23 & 2330.30 & 6.33 & 28557.80 & 77.60 \\
        128 & 1731.98 & 4.71 & 2282.93 & 6.20 & 32643.30 & 88.70 \\
        129 & 1903.31 & 5.17 & 2334.25 & 6.34 & 31198.20 & 84.78 \\
        191 & 1736.78 & 4.72 & 2345.91 & 6.37 & 32247.30 & 87.63 \\
        192 & 1694.44 & 4.60 & 2345.38 & 6.37 & 32830.60 & 89.21 \\
        229 & 1715.10 & 4.66 & 2351.01 & 6.39 & 34360.90 & 93.37 \\
        255 & 1720.39 & 4.67 & 2335.21 & 6.35 & 33477.70 & 90.97 \\
        256 & 777.65 & 2.11 & 2306.48 & 6.27 & 33473.90 & 90.96 \\
        257 & 1729.27 & 4.70 & 2330.68 & 6.33 & 33686.50 & 91.54 \\
        319 & 1704.80 & 4.63 & 2360.03 & 6.41 & 34335.20 & 93.30 \\
        320 & 1414.84 & 3.84 & 2364.53 & 6.43 & 36438.10 & 99.02 \\
        321 & 1741.30 & 4.73 & 2366.38 & 6.43 & 35433.70 & 96.29 \\
        417 & 1733.00 & 4.71 & 2378.34 & 6.46 & 36133.70 & 98.19 \\
        479 & 1731.17 & 4.70 & 2233.05 & 6.07 & 32951.40 & 89.54 \\
        480 & 1678.77 & 4.56 & 2187.87 & 5.95 & 37260.00 & 101.25 \\
        511 & 1733.60 & 4.71 & 2224.61 & 6.05 & 34128.00 & 92.74 \\
        512 & 782.96 & 2.13 & 2284.85 & 6.21 & 36526.40 & 99.26 \\
        639 & 1714.42 & 4.66 & 2292.78 & 6.23 & 35249.20 & 95.79 \\
        640 & 663.42 & 1.80 & 2264.70 & 6.15 & 36538.70 & 99.29 \\
        767 & 1690.82 & 4.59 & 2324.83 & 6.32 & 35718.50 & 97.06 \\
        768 & 792.04 & 2.15 & 2363.92 & 6.42 & 32116.80 & 87.27 \\
        769 & 1696.95 & 4.61 & 2321.31 & 6.31 & 33033.90 & 89.77 \\
    \bottomrule
\end{tabular}
\end{center}
    \caption{MFlop/s and CPU utilisation per matrix size of the matrix
    multiplication benchmark for the naive, blocked, and BLAS implementations.}
    \label{fig:benchtab}
\end{figure}
\end{document}
hw1: done 1.1 2022-09-26 17:35:06 +00:00			`\documentclass[unicode,11pt,a4paper,oneside,numbers=endperiod,openany]{scrartcl}`

			`\input{assignment.sty}`
hw1: work on ex2 report 2022-10-05 08:03:17 +00:00			`\usepackage{float}`
			`\usepackage{subcaption}`
			`\usepackage{graphicx}`
			`\usepackage{tikz}`
hw1: tweaks to figures 2022-10-05 09:14:40 +00:00			`\usepackage{multirow}`
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`\usepackage{makecell}`
			`\usepackage{booktabs}`
			`\usepackage[nomessages]{fp}`
			`\usetikzlibrary{decorations.markings}`
hw1: work on ex2 report 2022-10-05 08:03:17 +00:00
			`\begin{document}`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
			`\setassignment`
			`\setduedate{12.10.2022 (midnight)}`

			`\serieheader{High-Performance Computing Lab}{2022}{Student: Claudio`
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`Maggioni}{Discussed with: --}{Solution for Project 1}{}`
hw1: done 1.1 2022-09-26 17:35:06 +00:00			`\newline`

hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`%\assignmentpolicy`
			`%In this project you will practice memory access optimization,`
			`%performance-oriented programming, and OpenMP parallelizaton on the ICS Cluster.`

			`\tableofcontents`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
			`\section{Explaining Memory Hierarchies \punkte{25}}`

hw1: done ex1 2022-09-27 08:39:48 +00:00			`\subsection{Memory Hierarchy Parameters of the Cluster}`

hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`By invoking \texttt{likwid-topology} for the cache topology and \texttt{free -g}`
			`for the amount of primary memory, the following memory hierarchy parameters are`
			`found:`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
			`\begin{center}`
			`\begin{tabular}{llll}`
			`Main memory & 62 GB \\`
			`L3 cache & 25 MB per socket \\`
			`L2 cache & 256 kB per core \\`
			`L1 cache & 32 kB per core`
			`\end{tabular}`
			`\end{center}`

			`All values are reported using base 2 IEC byte units. The cluster has 2 sockets`
			`and a total of 20 cores (10 per socket). The cache topology diagram reported by`
hw1: tweaks to figures 2022-10-05 09:14:40 +00:00			`\texttt{likwid-topology -g} is shown in Figure \ref{fig:topo}.`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`\pagebreak[4]`

hw1: tweaks to figures 2022-10-05 09:14:40 +00:00			`\begin{figure}[t]`
			`\begin{center}`
			`Socket 0:\vspace{0.3cm}`
			`\begin{tabular}{\|l\|l\|l\|l\|l\|l\|l\|l\|l\|l\|}`
			`\hline 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\\hline`
			`32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32`
			`kB & 32 kB \\\hline`
			`256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256`
			`kB & 256 kB & 256 kB \\\hline`
			`\multicolumn{10}{\|c\|}{25 MB} \\\hline`
			`\end{tabular}\vspace{0.8cm}\\`
			`Socket 1:\vspace{0.3cm}`
			`\begin{tabular}{\|l\|l\|l\|l\|l\|l\|l\|l\|l\|l\|}`
			`\hline 10 & 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 \\\hline`
			`32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32`
			`kB & 32 kB \\\hline`
			`256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256`
			`kB & 256 kB & 256 kB \\\hline`
			`\multicolumn{10}{\|c\|}{25 MB} \\\hline`
			`\end{tabular}`
			`\end{center}`
			`\caption{Cache topology diagram as outputted by \texttt{likwid-topology -g}.`
			`Byte sizes all in IEC units.}`
			`\label{fig:topo}`
			`\end{figure}`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
hw1: done ex1 2022-09-27 08:39:48 +00:00			`\subsection{Memory Access Pattern of \texttt{membench.c}}`

hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`\begin{figure}[t]`
			`\begin{center}`
			`\begin{tikzpicture}`
			`\tikzset{->-/.style={decoration={`
			`markings,`
			`mark=at position .75 with {\arrow{>}}},postaction={decorate}}};`

			`\draw (0,0) grid (5,1);`
			`\draw [dashed] (5,0) -- (5.5,0);`
			`\draw [dashed] (5,1) -- (5.5,1);`
			`\draw [dashed] (6.5,0) -- (7,0);`
			`\draw [dashed] (6.5,1) -- (7,1);`
			`\draw (7,0) grid (12,1);`

			`\foreach \r in {0,1,...,4}{`
			`\fill (\r + 0.5,0.5) circle [radius=2pt];`
			`\draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);`
			`\draw (\r + 0.5, -0.5) node {$\r$};`
			`}`
			`\draw[->-] (4.5,0.5) to[bend left] (5.5,0.5);`
			`\foreach \r in {7,8,...,11}{`
			`\fill (\r + 0.5,0.5) circle [radius=2pt];`
			`\FPeval{l}{round(\r + 128 - 12, 0)}`
			`\draw[->-] (\r-0.5,0.5) to[bend left] (\r+0.5,0.5);`
			`\draw (\r + 0.5, -0.5) node {$\l$};`
			`}`

			`\draw (0,-3) grid (3,-2);`
			`\draw [dashed] (3,-2) -- (3.5,-2);`
			`\draw [dashed] (3,-3) -- (3.5,-3);`
			`\draw [dashed] (4,-2) -- (4.5,-2);`
			`\draw [dashed] (4,-3) -- (4.5,-3);`
			`\draw (4.5,-2) -- (7.5,-2);`
			`\draw (4.5,-3) -- (7.5,-3);`
			`\foreach \r in {4.5,5.5,...,7.5}{`
			`\draw (\r,-3) -- (\r,-2);`
			`}`
			`\draw [dashed] (7.5,-2) -- (8,-2);`
			`\draw [dashed] (7.5,-3) -- (8,-3);`
			`\draw [dashed] (8.5,-2) -- (9,-2);`
			`\draw [dashed] (8.5,-3) -- (9,-3);`
			`\draw (9,-3) grid (12,-2);`

			`\fill (0.5,-2.5) circle [radius=2pt];`
			`\fill (6,-2.5) circle [radius=2pt];`
			`\fill (11.5,-2.5) circle [radius=2pt];`
			`\foreach \r in {0,1,2}{`
			`\draw (\r + 0.5, -3.5) node {$\r$};`
			`}`
			`\foreach \r in {9,10,11}{`
			`\FPeval{l}{round(\r - 12, 0)}`
			`\draw (\r + 0.5, -3.5) node {\tiny $2^{20} \l$};`
			`}`
			`\foreach \r in {4.5,5.5}{`
			`\FPeval{l}{round(\r - 6.5, 0)}`
			`\draw (\r + 0.5, -3.5) node {\tiny $2^{10} \l$};`
			`}`
			`\draw (7,-3.5) node {\tiny $2^{10}$};`
			`\draw[->-] (-0.5,-2.5) to[bend left] (0.5,-2.5);`
			`\draw[->-] (0.5,-2.5) to[bend left] (6,-2.5);`
			`\draw[->-] (6,-2.5) to[bend left] (11.5,-2.5);`
			`\end{tikzpicture}`
			`\end{center}`
			`\caption{Memory access patterns of \texttt{membench.c} for \texttt{csize =`
			`128} and \texttt{stride = 1} (above) and for \texttt{csize = $2^{20}$} and`
			`\texttt{stride = $2^{10}$} (below)}`
			`\label{fig:access}`
			`\end{figure}`

hw1: done ex1 2022-09-27 08:39:48 +00:00			`The benchmark \texttt{membench.c} measures the average time of repeated read and`
			`write overations across a set of indices of a stack-allocated array of 32-bit`
			`signed integers. The indices vary according to the access pattern used, which in`
			`turn is defined by two variables, \texttt{csize} and \texttt{stride}.`
			`\texttt{csize} is an upper bound on the index value, i.e. (one more of) the`
			`highest index used to access the array in the pattern. \texttt{stride}`
			`determines the difference between array indexes over access iterations, i.e. a`
			`\texttt{stride} of 1 will access every array index, a \texttt{stride} of 2 will`
			`skip every other index, a \texttt{stride} of 4 will access one index then skip 3`
			`and so on and so forth.`

			`Therefore, for \texttt{csize = 128} and \texttt{stride = 1} the array will`
			`access all indexes between 0 and 127 sequentially, and for \texttt{csize =`
			`$2^{20}$} and \texttt{stride = $2^{10}$} the benchmark will access index 0, then`
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`index $2^{10}-1$, and finally index $2^{20}-1$. The access patterns for these`
			`two configurations are shown visually in Figure \ref{fig:access}.`
hw1: done ex1 2022-09-27 08:39:48 +00:00
			`\subsection{Analyzing Benchmark Results}`

hw1: work on ex2 report 2022-10-05 08:03:17 +00:00			`\begin{figure}[t]`
			`\begin{subfigure}{0.5\textwidth}`
			`\includegraphics[width=\textwidth]{generic_macos.pdf}`
			`\caption{Personal laptop}`
			`\label{fig:mem:laptop}`
			`\end{subfigure}`
			`\begin{subfigure}{0.5\textwidth}`
			`\includegraphics[width=\textwidth]{generic_cluster.pdf}`
			`\caption{Cluster}`
			`\label{fig:mem:cluster}`
			`\end{subfigure}`
			`\caption{Results of the \texttt{membench.c} benchmark for both my personal`
			`laptop (in Figure \ref{fig:mem:laptop}) and the cluster (in Figure`
			`\ref{fig:mem:cluster}).}`
			`\label{fig:mem}`
			`\end{figure}`
hw1: done ex1 2022-09-27 08:39:48 +00:00
hw1: work on ex2 report 2022-10-05 08:03:17 +00:00			`The \texttt{membench.c} benchmark results for my personal laptop (Macbook Pro`
			`2018 with a Core i7-8750H CPU) and the cluster are shown in figure`
			`\ref{fig:mem}.`
hw1: done ex1 2022-09-27 08:39:48 +00:00
			`The memory access graph for the cluster's benchmark results shows that temporal`
			`locality is best for small array sizes and for small \texttt{stride} values.`
			`In particular, for array memory sizes of 16MB or lower (\texttt{csize} of $4`
			`\cdot 2^{20}$ or lower) and \texttt{stride} values of 2048 or lower the mean`
			`read+write time is less than 10 nanoseconds. Temporal locality is worst for`
			`large sizes and strides, although the largest values of \texttt{stride} for each`
			`size (like \texttt{csize / 2} and \texttt{csize / 4}) achieve better mean times`
			`due to the few elements accessed in the pattern (this observation is also valid`
			`for the largest strides of each size series shown in the graph).`

hw1: done 1.1 2022-09-26 17:35:06 +00:00			`\section{Optimize Square Matrix-Matrix Multiplication \punkte{60}}`

hw1: work on ex2 report 2022-10-05 08:03:17 +00:00			`The file \texttt{matmult/dgemm-blocked.c} contains a C implementation of the`
			`blocked matrix multiplication algorithm presented in the project. Other than`
			`implementing the pseudocode, my implementation:`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
hw1: work on ex2 report 2022-10-05 08:03:17 +00:00			`\begin{figure}[t]`
			`\begin{center}`
			`\begin{tikzpicture}`
			`\fill[blue!60!white] (4,0) rectangle (5,-2);`
			`\fill[blue!40!white] (4,-2) rectangle (5,-4);`
			`\fill[blue!60!white] (0,-4) rectangle (2,-5);`
			`\fill[blue!40!white] (2,-4) rectangle (4,-5);`
			`\fill[green!40!white] (4,-4) rectangle (5,-5);`
			`\draw[step=1,gray,very thin] (0,0) grid (5,-5);`
			`\draw[step=2] (0,0) grid (5,-5);`
			`\draw[step=5] (0,0) grid (5,-5);`
			`\end{tikzpicture}`
			`\end{center}`
			`\caption{Result of the block division process of a square matrix of size 5`
			`using a block size of 2. The 2-by-1 and 1-by-2 rectangular remainders are`
			`shown in blue and the square matrix of remainder size (i.e. 1) is shown in`
			`green.}`
			`\label{fig:matrix}`
			`\end{figure}`

			`\begin{itemize}`
			\item Handles the edge cases related to the ``remainders'' in the matrix
			`block division, i.e. when the division between the size of the matrix`
			`and the block size yields a remainder. Assuming only squared matrices`
			`are multiplied through the algorithm (as in the test suite provided) the`
			`block division could yield rectangular matrix blocks located in the last`
			`rows and columns of each matrix, and the bottom-right corner of the`
			`matrix will be contained in a square matrix block of the size of the`
			`remainder. The result of this process is shown in Figure`
			`\ref{fig:matrix};`
			`\item Converts matrix A into row major format. As shown in Figure`
			`\ref{fig:iter}, by having A in row major format and B in column major`
			`format, iterations across matrix block in the inner most loop of the`
			`algorithm (the one calling \textit{naivemm}) cache hits are maximised by`
			`achieving space locality between the blocks used;`
			`\item Caches the result of each innermost iteration into a temporary matrix`
			`of block size before storing it into matrix C. This achieves better`
			`space locality when \textit{naivemm} needs to store values in matrix C.`
			`The block size temporary matrix has virtually no stride and thus cache`
			`hits are maximised. The copy operation is implemented with bulk copy`
			`\texttt{memcpy} calls.`
			`\end{itemize}`

			`\begin{figure}[t]`
			`\begin{center}`
			`\begin{tikzpicture}`
			`\node[align=center] at (2.5,0.5) {Matrix A};`
			`\fill[orange!10!white] (0,0) rectangle (2,-2);`
			`\fill[orange!25!white] (2,0) rectangle (4,-2);`
			`\fill[orange!40!white] (4,0) rectangle (5,-2);`

			`\draw[step=1,gray,very thin] (0,0) grid (5,-5);`
			`\draw[step=2,black,thick] (0,0) grid (5,-5);`
			`\draw[step=5,black,thick] (0,0) grid (5,-5);`

			`\draw[-to,step=1,red,very thick] (0.5,-0.5) -- (4.5,-0.5);`
			`\draw[-to,step=1,red,very thick] (0.5,-1.5) -- (4.5,-1.5);`
			`\draw[-to,step=1,red,very thick] (0.5,-2.5) -- (4.5,-2.5);`
			`\draw[-to,step=1,red,very thick] (0.5,-3.5) -- (4.5,-3.5);`
			`\draw[-to,step=1,red,very thick] (0.5,-4.5) -- (4.5,-4.5);`

			`\node[align=center] at (8.5,0.5) {Matrix B};`
			`\fill[orange!10!white] (6,0) rectangle (8,-2);`
			`\fill[orange!25!white] (6,-2) rectangle (8,-4);`
			`\fill[orange!40!white] (6,-4) rectangle (8,-5);`

			`\draw[step=1,gray,very thin] (6,0) grid (11,-5);`
			`\draw[step=2,black,thick] (6,0) grid (11,-5);`
			`\draw[step=5,black,thick] (6,0) grid (11,-5);`
			`\draw[black,thick] (11,0) -- (11,-5);`

			`\draw[-to,step=1,red,very thick] (6.5,-0.5) -- (6.5,-4.5);`
			`\draw[-to,step=1,red,very thick] (7.5,-0.5) -- (7.5,-4.5);`
			`\draw[-to,step=1,red,very thick] (8.5,-0.5) -- (8.5,-4.5);`
			`\draw[-to,step=1,red,very thick] (9.5,-0.5) -- (9.5,-4.5);`
			`\draw[-to,step=1,red,very thick] (10.5,-0.5) -- (10.5,-4.5);`
			`\end{tikzpicture}`
			`\end{center}`
			`\caption{Inner most loop iteration of the blocked GEMM algorithm across`
			matrices A and B. The red lines represent the ``majorness'' of each matrix
			`(A is converted by the algorithm in row-major form, while B is given and`
			`used in column-major form). The shades of orange represent the blocks used`
			`in each iteration.}`
			`\label{fig:iter}`
			`\end{figure}`

			`The results of the matrix multiplication benchmark for the naive, blocked, and`
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`BLAS implementations are shown in Figure \ref{fig:bench} as a graph of GFlop/s`
			`over matrix size or in Figure \ref{fig:benchtab} as a table. The blocked`
			`implementation achieves on average 50\% more FLOPS than the naive`
hw1: submittable 2022-10-05 08:30:24 +00:00			`implementation thanks to the optimisations in space and temporal cache locality`
			`described. However, the blocked implementation achives less than a tenth of`
			`FLOPS compared to Intel MKL BLAS based one due to the microarchitecture`
			`optimization the latter one is able to exploit.`
hw1: work on ex2 report 2022-10-05 08:03:17 +00:00
			`\begin{figure}[t]`
			`\includegraphics[width=\textwidth]{timing.pdf}`
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`\caption{GFlop/s per matrix size of the matrix multiplication benchmark for the naive,`
			`blocked, and BLAS implementations. The Y-axis is log-scaled.}`
hw1: work on ex2 report 2022-10-05 08:03:17 +00:00			`\label{fig:bench}`
			`\end{figure}`
hw1: done 1.1 2022-09-26 17:35:06 +00:00
hw1: added figure for stride/csize graph 2022-10-05 18:04:15 +00:00			`\begin{figure}[t]`
			`\begin{center}`
			`\begin{tabular}{c\|cc\|cc\|cc}`
			`\toprule`
			`& \multicolumn{2}{c\|}{Naive} & \multicolumn{2}{c\|}{Blocked} &`
			`\multicolumn{2}{c}{BLAS} \\`
			`\makecell{Size} & \makecell{MFLOPS} &`
			`\makecell{\% CPU} & \makecell{MFLOPS} &`
			`\makecell{\% CPU} & \makecell{MFLOPS} &`
			`\makecell{\% CPU} \\`
			`\midrule`
			`31 & 2393.33 & 6.50 & 2112.63 & 5.74 & 23449.20 & 63.72 \\`
			`32 & 2400.13 & 6.52 & 2187.44 & 5.94 & 28198.90 & 76.63 \\`
			`96 & 1998.74 & 5.43 & 2325.39 & 6.32 & 32542.30 & 88.43 \\`
			`97 & 1996.01 & 5.42 & 2322.81 & 6.31 & 29801.30 & 80.98 \\`
			`127 & 1923.81 & 5.23 & 2330.30 & 6.33 & 28557.80 & 77.60 \\`
			`128 & 1731.98 & 4.71 & 2282.93 & 6.20 & 32643.30 & 88.70 \\`
			`129 & 1903.31 & 5.17 & 2334.25 & 6.34 & 31198.20 & 84.78 \\`
			`191 & 1736.78 & 4.72 & 2345.91 & 6.37 & 32247.30 & 87.63 \\`
			`192 & 1694.44 & 4.60 & 2345.38 & 6.37 & 32830.60 & 89.21 \\`
			`229 & 1715.10 & 4.66 & 2351.01 & 6.39 & 34360.90 & 93.37 \\`
			`255 & 1720.39 & 4.67 & 2335.21 & 6.35 & 33477.70 & 90.97 \\`
			`256 & 777.65 & 2.11 & 2306.48 & 6.27 & 33473.90 & 90.96 \\`
			`257 & 1729.27 & 4.70 & 2330.68 & 6.33 & 33686.50 & 91.54 \\`
			`319 & 1704.80 & 4.63 & 2360.03 & 6.41 & 34335.20 & 93.30 \\`
			`320 & 1414.84 & 3.84 & 2364.53 & 6.43 & 36438.10 & 99.02 \\`
			`321 & 1741.30 & 4.73 & 2366.38 & 6.43 & 35433.70 & 96.29 \\`
			`417 & 1733.00 & 4.71 & 2378.34 & 6.46 & 36133.70 & 98.19 \\`
			`479 & 1731.17 & 4.70 & 2233.05 & 6.07 & 32951.40 & 89.54 \\`
			`480 & 1678.77 & 4.56 & 2187.87 & 5.95 & 37260.00 & 101.25 \\`
			`511 & 1733.60 & 4.71 & 2224.61 & 6.05 & 34128.00 & 92.74 \\`
			`512 & 782.96 & 2.13 & 2284.85 & 6.21 & 36526.40 & 99.26 \\`
			`639 & 1714.42 & 4.66 & 2292.78 & 6.23 & 35249.20 & 95.79 \\`
			`640 & 663.42 & 1.80 & 2264.70 & 6.15 & 36538.70 & 99.29 \\`
			`767 & 1690.82 & 4.59 & 2324.83 & 6.32 & 35718.50 & 97.06 \\`
			`768 & 792.04 & 2.15 & 2363.92 & 6.42 & 32116.80 & 87.27 \\`
			`769 & 1696.95 & 4.61 & 2321.31 & 6.31 & 33033.90 & 89.77 \\`
			`\bottomrule`
			`\end{tabular}`
			`\end{center}`
			`\caption{MFlop/s and CPU utlisation per matrix size of the matrix`
			`multiplication benchmark for the naive, blocked, and BLAS implementations.}`
			`\label{fig:benchtab}`
			`\end{figure}`
hw1: done 1.1 2022-09-26 17:35:06 +00:00			`\end{document}`