hw1: work on ex2 report

2022-10-05 10:03:17 +02:00 · 2022-10-05 10:03:17 +02:00 · f6dd2a2d6b
commit f6dd2a2d6b
parent 8013c535e6
3 changed files with 1183 additions and 10 deletions
--- a/Project1/project_1_maggioni_claudio.pdf
+++ b/Project1/project_1_maggioni_claudio.pdf
--- a/Project1/project_1_maggioni_claudio.tex
+++ b/Project1/project_1_maggioni_claudio.tex
@ -1,10 +1,15 @@
 \documentclass[unicode,11pt,a4paper,oneside,numbers=endperiod,openany]{scrartcl}

 \input{assignment.sty}
+\usepackage{float}
+\usepackage{subcaption}
+\usepackage{graphicx}
 \usepackage{fancyvrb}
-\begin{document}
+\usepackage{tikz}


+\begin{document}
+
 \setassignment
 \setduedate{12.10.2022 (midnight)}

@ -13,8 +18,8 @@ Maggioni}{Discussed with: ---}{Solution for Project 1}{}
 \newline

 \assignmentpolicy
-In this project you will practice memory access optimization, performance-oriented programming, and OpenMP parallelizaton 
-on the ICS Cluster .  
+In this project you will practice memory access optimization,
+performance-oriented programming, and OpenMP parallelization on the ICS Cluster.  

 \section{Explaining Memory Hierarchies \punkte{25}}

@ -92,13 +97,26 @@ index $2^{10}-1$, and finally index $2^{20}-1$.

 \subsection{Analyzing Benchmark Results}

-The \texttt{membench.c} benchmark results for my personal laptop (Macbook Pro
-2018 with a Core i7-8750H CPU) and the cluster are shown below respectively:
+\begin{figure}[t]
+    \begin{subfigure}{0.5\textwidth}
+\includegraphics[width=\textwidth]{generic_macos.pdf}
+        \caption{Personal laptop}
+        \label{fig:mem:laptop}
+    \end{subfigure}%
+    \begin{subfigure}{0.5\textwidth}
+\includegraphics[width=\textwidth]{generic_cluster.pdf}
+        \caption{Cluster}
+        \label{fig:mem:cluster}
+    \end{subfigure}
+    \caption{Results of the \texttt{membench.c} benchmark for both my personal
+    laptop (in Figure~\ref{fig:mem:laptop}) and the cluster (in
+    Figure~\ref{fig:mem:cluster}).}
+    \label{fig:mem}
+\end{figure}

-\begin{center}
-\includegraphics[width=12cm]{generic_macos.pdf}
-\includegraphics[width=12cm]{generic_cluster.pdf}
-\end{center}
+The \texttt{membench.c} benchmark results for my personal laptop (MacBook Pro
+2018 with a Core i7-8750H CPU) and the cluster are shown in
+Figure~\ref{fig:mem}.

 The memory access graph for the cluster's benchmark results shows that temporal
 locality is best for small array sizes and for small \texttt{stride} values.
@ -112,8 +130,104 @@ for the largest strides of each size series shown in the graph).

 \section{Optimize Square Matrix-Matrix Multiplication  \punkte{60}}

+The file \texttt{matmult/dgemm-blocked.c} contains a C implementation of the
+blocked matrix multiplication algorithm presented in the project. Other than
+implementing the pseudocode, my implementation:

-\section{Quality of the Report  \punkte{15}}
+\begin{figure}[t]
+\begin{center}
+\begin{tikzpicture}
+    \fill[blue!60!white] (4,0) rectangle (5,-2);
+    \fill[blue!40!white] (4,-2) rectangle (5,-4);
+    \fill[blue!60!white] (0,-4) rectangle (2,-5);
+    \fill[blue!40!white] (2,-4) rectangle (4,-5);
+    \fill[green!40!white] (4,-4) rectangle (5,-5);
+    \draw[step=1,gray,very thin] (0,0) grid (5,-5);
+    \draw[step=2] (0,0) grid (5,-5);
+    \draw[step=5] (0,0) grid (5,-5);
+\end{tikzpicture}
+\end{center}
+    \caption{Result of the block division process of a square matrix of size 5
+    using a block size of 2. The 2-by-1 and 1-by-2 rectangular remainders are
+    shown in blue and the square matrix of remainder size (i.e.\ 1) is shown in
+    green.}
+    \label{fig:matrix}
+\end{figure}

+\begin{itemize}
+    \item Handles the edge cases related to the ``remainders'' in the matrix
+        block division, i.e.\ when the division between the size of the matrix
+        and the block size yields a remainder. Assuming only square matrices
+        are multiplied through the algorithm (as in the test suite provided),
+        the block division could yield rectangular matrix blocks located in the
+        last rows and columns of each matrix, and the bottom-right corner of the
+        matrix will be contained in a square matrix block of the size of the
+        remainder. The result of this process is shown in
+        Figure~\ref{fig:matrix};
+    \item Converts matrix A into row-major format. As shown in
+        Figure~\ref{fig:iter}, by having A in row-major format and B in
+        column-major format, cache hits are maximised when iterating across
+        matrix blocks in the innermost loop of the algorithm (the one calling
+        \textit{naivemm}), since spatial locality is achieved between the blocks used;
+    \item Caches the result of each innermost iteration into a temporary matrix
+        of block size before storing it into matrix C. This achieves better
+        spatial locality when \textit{naivemm} needs to store values in matrix C.
+        The block-sized temporary matrix has virtually no stride and thus cache
+        hits are maximised. The copy operation is implemented with bulk copy
+        \texttt{memcpy} calls.
+\end{itemize}
+
+\begin{figure}[t]
+\begin{center}
+\begin{tikzpicture}
+    \node[align=center] at (2.5,0.5) {Matrix A};
+    \fill[orange!10!white] (0,0) rectangle (2,-2);
+    \fill[orange!25!white] (2,0) rectangle (4,-2);
+    \fill[orange!40!white] (4,0) rectangle (5,-2);
+    
+    \draw[step=1,gray,very thin] (0,0) grid (5,-5);
+    \draw[step=2,black,thick] (0,0) grid (5,-5);
+    \draw[step=5,black,thick] (0,0) grid (5,-5);
+    
+    \draw[-to,step=1,red,very thick] (0.5,-0.5) -- (4.5,-0.5); 
+    \draw[-to,step=1,red,very thick] (0.5,-1.5) -- (4.5,-1.5); 
+    \draw[-to,step=1,red,very thick] (0.5,-2.5) -- (4.5,-2.5); 
+    \draw[-to,step=1,red,very thick] (0.5,-3.5) -- (4.5,-3.5); 
+    \draw[-to,step=1,red,very thick] (0.5,-4.5) -- (4.5,-4.5); 
+    
+    \node[align=center] at (8.5,0.5) {Matrix B};
+    \fill[orange!10!white] (6,0) rectangle (8,-2);
+    \fill[orange!25!white] (6,-2) rectangle (8,-4);
+    \fill[orange!40!white] (6,-4) rectangle (8,-5);
+    
+    \draw[step=1,gray,very thin] (6,0) grid (11,-5);
+    \draw[step=2,black,thick] (6,0) grid (11,-5);
+    \draw[step=5,black,thick] (6,0) grid (11,-5);
+    \draw[black,thick] (11,0) -- (11,-5);
+
+    \draw[-to,step=1,red,very thick] (6.5,-0.5) -- (6.5,-4.5); 
+    \draw[-to,step=1,red,very thick] (7.5,-0.5) -- (7.5,-4.5); 
+    \draw[-to,step=1,red,very thick] (8.5,-0.5) -- (8.5,-4.5); 
+    \draw[-to,step=1,red,very thick] (9.5,-0.5) -- (9.5,-4.5); 
+    \draw[-to,step=1,red,very thick] (10.5,-0.5) -- (10.5,-4.5); 
+\end{tikzpicture}
+\end{center}
+    \caption{Innermost loop iteration of the blocked GEMM algorithm across
+    matrices A and B. The red lines represent the ``majorness'' of each matrix
+    (A is converted by the algorithm into row-major form, while B is given and
+    used in column-major form). The shades of orange represent the blocks used
+    in each iteration.}
+    \label{fig:iter}
+\end{figure}
+
+The results of the matrix multiplication benchmark for the naive, blocked, and
+BLAS implementations are shown in Figure~\ref{fig:bench}.
+
+\begin{figure}[t]
+    \includegraphics[width=\textwidth]{timing.pdf}
+    \caption{Results of the matrix multiplication benchmark for the naive,
+    blocked, and BLAS implementations.}
+    \label{fig:bench}
+\end{figure}

 \end{document}
--- a/Project1/timing.pdf
+++ b/Project1/timing.pdf