diff --git a/.~lock.status.ods# b/.~lock.status.ods# deleted file mode 100644 index 8f081e01..00000000 --- a/.~lock.status.ods# +++ /dev/null @@ -1 +0,0 @@ -,maggicl,Apple2gs.local,24.05.2021 14:04,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4; \ No newline at end of file diff --git a/machine_time_waste/statuses_total_time.ipynb b/machine_time_waste/statuses_total_time.ipynb index dddd11b4..f62c6958 100644 --- a/machine_time_waste/statuses_total_time.ipynb +++ b/machine_time_waste/statuses_total_time.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -27,12 +27,13 @@ " 'text.usetex': True,\n", " 'pgf.rcfonts': False,\n", "})\n", - "import matplotlib.pyplot as plt" + "import matplotlib.pyplot as plt\n", + "pandas.options.display.float_format = '{:,.3f}'.format" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -53,16 +54,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "DIR = \"/home/claudio/hdd/git/bachelorThesis\"" + "DIR = \"/Users/maggicl/git/bachelorThesis\"" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -138,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -174,6 +175,9 @@ "\n", " df.rename(columns = {'time_type': 'Execution phase'}, inplace = True)\n", " \n", + " print(\"Cluster \"+cluster)\n", + " print(df)\n", + " \n", " h = sns.histplot(df, x=\"Last termination\", \n", " weights=\"time_ms\", shrink=.5, common_bins=True,\n", " hue=\"Execution phase\", multiple=\"stack\", discrete=True, legend=True)\n", @@ -191,149 +195,193 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Last termination time_type time_ms\n", - "0 EVICT Queue 1.049774e+12\n", - "1 EVICT Resubmission 5.530617e+08\n", - "2 EVICT Running 3.218063e+13\n", - "3 EVICT Unknown 3.383291e+12\n", - "4 FAIL Queue 9.483261e+11\n", - "5 FAIL Resubmission 7.150500e+01\n", - "6 FAIL Running 7.265195e+12\n", - "7 FAIL Unknown 2.799674e+12\n", - "8 FINISH Queue 3.317009e+13\n", - "9 FINISH Resubmission 1.828825e+07\n", - "10 FINISH Running 3.788436e+13\n", - "11 FINISH Unknown 2.482661e+13\n", - "12 KILL Queue 7.482888e+13\n", - "13 KILL Resubmission 1.211419e+11\n", - "14 KILL Running 6.311166e+14\n", - "15 KILL Unknown 1.207792e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 2.991028e+11\n", - "1 EVICT Resubmission 1.360657e+09\n", - "2 EVICT Running 2.871365e+13\n", - "3 EVICT Unknown 1.428912e+13\n", - "4 FAIL Queue 9.376134e+10\n", - "5 FAIL Resubmission 1.225520e+02\n", - "6 FAIL Running 8.338530e+12\n", - "7 FAIL Unknown 1.989378e+12\n", - "8 FINISH Queue 6.817208e+12\n", - "9 FINISH Resubmission 1.493729e+03\n", - "10 FINISH Running 8.069421e+13\n", - "11 FINISH Unknown 1.006353e+14\n", - "12 KILL Queue 5.397953e+13\n", - "13 KILL Resubmission 1.842002e+10\n", - "14 KILL Running 5.716892e+14\n", - "15 KILL Unknown 2.088855e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 3.158380e+11\n", - "1 EVICT Resubmission 
2.355575e+09\n", - "2 EVICT Running 4.229815e+13\n", - "3 EVICT Unknown 6.785277e+12\n", - "4 FAIL Queue 2.352869e+11\n", - "5 FAIL Resubmission 4.684500e+01\n", - "6 FAIL Running 9.316941e+12\n", - "7 FAIL Unknown 4.873943e+12\n", - "8 FINISH Queue 1.172189e+13\n", - "9 FINISH Resubmission 3.623451e+03\n", - "10 FINISH Running 1.154498e+14\n", - "11 FINISH Unknown 4.934279e+13\n", - "12 KILL Queue 7.171264e+13\n", - "13 KILL Resubmission 2.108520e+11\n", - "14 KILL Running 6.180005e+14\n", - "15 KILL Unknown 2.088457e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 1.415993e+11\n", - "1 EVICT Resubmission 2.835890e+08\n", - "2 EVICT Running 4.303187e+13\n", - "3 EVICT Unknown 7.410999e+12\n", - "4 FAIL Queue 2.231462e+10\n", - "5 FAIL Resubmission 1.073960e+02\n", - "6 FAIL Running 1.186956e+13\n", - "7 FAIL Unknown 2.829927e+12\n", - "8 FINISH Queue 4.455665e+12\n", - "9 FINISH Resubmission 1.577302e+03\n", - "10 FINISH Running 6.516562e+13\n", - "11 FINISH Unknown 7.106965e+13\n", - "12 KILL Queue 7.435926e+13\n", - "13 KILL Resubmission 5.556059e+11\n", - "14 KILL Running 4.702722e+14\n", - "15 KILL Unknown 2.040366e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 1.722618e+10\n", - "1 EVICT Resubmission 1.788932e+09\n", - "2 EVICT Running 1.710804e+13\n", - "3 EVICT Unknown 7.078678e+12\n", - "4 FAIL Queue 2.895755e+09\n", - "5 FAIL Resubmission 5.304400e+01\n", - "6 FAIL Running 2.281806e+12\n", - "7 FAIL Unknown 3.984907e+11\n", - "8 FINISH Queue 7.454410e+11\n", - "9 FINISH Resubmission 6.310360e+02\n", - "10 FINISH Running 4.284518e+13\n", - "11 FINISH Unknown 3.672368e+13\n", - "12 KILL Queue 1.398332e+14\n", - "13 KILL Resubmission 4.825723e+10\n", - "14 KILL Running 3.049664e+14\n", - "15 KILL Unknown 3.072445e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 3.140594e+10\n", - "1 EVICT Resubmission 1.504263e+10\n", - "2 EVICT Running 5.070239e+13\n", - "3 EVICT Unknown 1.602834e+13\n", - "4 FAIL Queue 5.523972e+09\n", - "5 FAIL Resubmission 2.352700e+01\n", - "6 FAIL Running 3.889624e+12\n", - "7 FAIL Unknown 1.833895e+12\n", - "8 FINISH Queue 1.098116e+13\n", - "9 FINISH Resubmission 6.319590e+02\n", - "10 FINISH Running 9.761364e+13\n", - "11 FINISH Unknown 9.603417e+13\n", - "12 KILL Queue 1.129539e+14\n", - "13 KILL Resubmission 1.356476e+11\n", - "14 KILL Running 4.505937e+14\n", - "15 KILL Unknown 2.669451e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 9.528645e+10\n", - "1 EVICT Resubmission 1.493116e+09\n", - "2 EVICT Running 8.513084e+12\n", - "3 EVICT Unknown 2.778074e+12\n", - "4 FAIL Queue 2.887122e+11\n", - "5 FAIL Resubmission 1.757300e+01\n", - "6 FAIL Running 1.867799e+12\n", - "7 FAIL Unknown 6.622832e+11\n", - "8 FINISH Queue 8.337090e+11\n", - "9 FINISH Resubmission 6.753141e+07\n", - "10 FINISH Running 3.514254e+13\n", - "11 FINISH Unknown 6.704536e+13\n", - "12 KILL Queue 1.152843e+14\n", - "13 KILL Resubmission 5.814544e+10\n", - "14 KILL Running 2.225128e+14\n", - "15 KILL Unknown 3.894626e+15\n", - " Last termination time_type time_ms\n", - "0 EVICT Queue 4.621613e+10\n", - "1 EVICT Resubmission 4.511340e+02\n", - "2 EVICT Running 2.786346e+13\n", - "3 EVICT Unknown 9.513981e+12\n", - "4 FAIL Queue 7.828423e+09\n", - "5 FAIL Resubmission 1.148130e+02\n", - "6 FAIL Running 3.509052e+12\n", - "7 FAIL Unknown 1.212378e+12\n", - "8 FINISH Queue 9.252380e+12\n", - "9 FINISH Resubmission 1.675400e+02\n", - "10 FINISH Running 7.635478e+13\n", - "11 FINISH Unknown 5.980213e+13\n", - 
"12 KILL Queue 1.543895e+14\n", - "13 KILL Resubmission 3.419664e+09\n", - "14 KILL Running 3.838571e+14\n", - "15 KILL Unknown 4.039843e+15\n" + "Cluster a\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.051\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 1.564\n", + "3 EVICT Unknown 0.164\n", + "4 FAIL Queue 0.046\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.353\n", + "7 FAIL Unknown 0.136\n", + "8 FINISH Queue 1.612\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 1.841\n", + "11 FINISH Unknown 1.207\n", + "12 KILL Queue 3.637\n", + "13 KILL Resubmission 0.006\n", + "14 KILL Running 30.676\n", + "15 KILL Unknown 58.706\n", + "Cluster b\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.010\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 0.971\n", + "3 EVICT Unknown 0.483\n", + "4 FAIL Queue 0.003\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.282\n", + "7 FAIL Unknown 0.067\n", + "8 FINISH Queue 0.231\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 2.729\n", + "11 FINISH Unknown 3.404\n", + "12 KILL Queue 1.826\n", + "13 KILL Resubmission 0.001\n", + "14 KILL Running 19.337\n", + "15 KILL Unknown 70.655\n", + "Cluster c\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.010\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 1.401\n", + "3 EVICT Unknown 0.225\n", + "4 FAIL Queue 0.008\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.309\n", + "7 FAIL Unknown 0.161\n", + "8 FINISH Queue 0.388\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 3.824\n", + "11 FINISH Unknown 1.635\n", + "12 KILL Queue 2.376\n", + "13 KILL Resubmission 0.007\n", + "14 KILL Running 20.472\n", + "15 KILL Unknown 69.183\n", + "Cluster d\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.005\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 1.542\n", + "3 EVICT Unknown 0.265\n", + "4 FAIL Queue 0.001\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.425\n", + "7 FAIL Unknown 0.101\n", + "8 FINISH Queue 0.160\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 2.334\n", + "11 FINISH Unknown 2.546\n", + "12 KILL Queue 2.664\n", + "13 KILL Resubmission 0.020\n", + "14 KILL Running 16.846\n", + "15 KILL Unknown 73.091\n", + "Cluster e\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.000\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 0.472\n", + "3 EVICT Unknown 0.195\n", + "4 FAIL Queue 0.000\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.063\n", + "7 FAIL Unknown 0.011\n", + "8 FINISH Queue 0.021\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 1.182\n", + "11 FINISH Unknown 1.013\n", + "12 KILL Queue 3.858\n", + "13 KILL Resubmission 0.001\n", + "14 KILL Running 8.414\n", + "15 KILL Unknown 84.769\n", + "Cluster f\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.001\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 1.444\n", + "3 EVICT Unknown 0.457\n", + "4 FAIL Queue 0.000\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.111\n", + "7 FAIL Unknown 0.052\n", + "8 FINISH Queue 0.313\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 2.781\n", + "11 FINISH Unknown 2.736\n", + "12 KILL Queue 3.218\n", + "13 KILL Resubmission 0.004\n", + "14 KILL Running 12.836\n", + "15 KILL Unknown 76.047\n", + "Cluster g\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.002\n", + "1 EVICT Resubmission 0.000\n", + "2 
EVICT Running 0.196\n", + "3 EVICT Unknown 0.064\n", + "4 FAIL Queue 0.007\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.043\n", + "7 FAIL Unknown 0.015\n", + "8 FINISH Queue 0.019\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 0.808\n", + "11 FINISH Unknown 1.541\n", + "12 KILL Queue 2.650\n", + "13 KILL Resubmission 0.001\n", + "14 KILL Running 5.116\n", + "15 KILL Unknown 89.538\n", + "Cluster h\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.001\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 0.585\n", + "3 EVICT Unknown 0.200\n", + "4 FAIL Queue 0.000\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.074\n", + "7 FAIL Unknown 0.025\n", + "8 FINISH Queue 0.194\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 1.602\n", + "11 FINISH Unknown 1.255\n", + "12 KILL Queue 3.240\n", + "13 KILL Resubmission 0.000\n", + "14 KILL Running 8.055\n", + "15 KILL Unknown 84.770\n", + "Cluster all\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 0.007\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 0.925\n", + "3 EVICT Unknown 0.248\n", + "4 FAIL Queue 0.006\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.179\n", + "7 FAIL Unknown 0.061\n", + "8 FINISH Queue 0.288\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 2.036\n", + "11 FINISH Unknown 1.867\n", + "12 KILL Queue 2.945\n", + "13 KILL Resubmission 0.004\n", + "14 KILL Running 13.493\n", + "15 KILL Unknown 77.941\n", + "Cluster 2011\n", + " Last termination Execution phase time_ms\n", + "0 EVICT Queue 2.500\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 17.500\n", + "3 EVICT Unknown 0.000\n", + "4 FAIL Queue 0.000\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 5.000\n", + "7 FAIL Unknown 0.000\n", + "8 FINISH Queue 1.000\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 39.000\n", + "11 FINISH Unknown 0.000\n", + "12 KILL Queue 5.000\n", + "13 KILL Resubmission 1.000\n", + "14 KILL Running 30.000\n", + "15 KILL Unknown 0.000\n" ] } ], @@ -343,8 +391,6 @@ "\n", "for cluster in \"abcdefgh\":\n", " df, totals = create_df(cluster)\n", - " \n", - " print(df)\n", "\n", " #plt.figure(figsize=(10,8))\n", " #graph_1(df, cluster)\n", @@ -377,30 +423,30 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Last termination time_type time_ms\n", - "0 EVICT Queue 7.373993e-05\n", - "1 EVICT Resubmission 8.449953e-07\n", - "2 EVICT Running 9.249078e-03\n", - "3 EVICT Unknown 2.484572e-03\n", - "4 FAIL Queue 5.926861e-05\n", - "5 FAIL Resubmission 2.058252e-14\n", - "6 FAIL Running 1.785410e-03\n", - "7 FAIL Unknown 6.131290e-04\n", - "8 FINISH Queue 2.880144e-03\n", - "9 FINISH Resubmission 3.170097e-09\n", - "10 FINISH Running 2.035703e-02\n", - "11 FINISH Unknown 1.867017e-02\n", - "12 KILL Queue 2.945024e-02\n", - "13 KILL Resubmission 4.253091e-05\n", - "14 KILL Running 1.349259e-01\n", - "15 KILL Unknown 7.794080e-01\n" + " Last termination time_type time_ms\n", + "0 EVICT Queue 0.000\n", + "1 EVICT Resubmission 0.000\n", + "2 EVICT Running 0.009\n", + "3 EVICT Unknown 0.002\n", + "4 FAIL Queue 0.000\n", + "5 FAIL Resubmission 0.000\n", + "6 FAIL Running 0.002\n", + "7 FAIL Unknown 0.001\n", + "8 FINISH Queue 0.003\n", + "9 FINISH Resubmission 0.000\n", + "10 FINISH Running 0.020\n", + "11 FINISH Unknown 0.019\n", + "12 KILL Queue 0.029\n", + "13 KILL Resubmission 0.000\n", + "14 KILL Running 
0.135\n", + "15 KILL Unknown 0.779\n" ] } ], diff --git a/report/Claudio_Maggioni_report.pdf b/report/Claudio_Maggioni_report.pdf index a31e64f5..32d4fa93 100644 Binary files a/report/Claudio_Maggioni_report.pdf and b/report/Claudio_Maggioni_report.pdf differ diff --git a/report/Claudio_Maggioni_report.tex b/report/Claudio_Maggioni_report.tex index 014213d2..146eba54 100644 --- a/report/Claudio_Maggioni_report.tex +++ b/report/Claudio_Maggioni_report.tex @@ -82,6 +82,15 @@ and stored in JSONL format)\cite{google-drive-marso}, requiring a considerable amount of computational power to analyze them and the implementation of special data engineering techniques for analysis of the data. +\input{figures/machine_configs} + +An overview of the machine configurations in the cluster analyzed with the 2011 +traces and in the 8 clusters composing the 2019 traces can be found in +figure~\ref{fig:machineconfigs}. Additionally, figure~\ref{fig:machineconfigs-csts} +breaks the same machine configuration data down cluster by cluster for the 2019 +traces. + This project aims to repeat the analysis performed in 2015 to highlight similarities and differences in workload this decade brought, and expanding the old analysis to understand even better the causes of failures and how to prevent @@ -441,102 +450,133 @@ deltas. Finally, the mean of the computed slowdown values is computed resulting in the clear and coincise tables found in figure~\ref{fig:taskslowdown}. -\hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{% -\subsection{Ad-Hoc presentation of some analysis -scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}} - -\textbf{TBD} (with diagrams) - -\hypertarget{analysis-and-observations}{% -\section{Analysis and observations}\label{analysis-and-observations}} - -\hypertarget{overview-of-machine-configurations-in-each-cluster}{% -\subsection{Overview of machine configurations in each -cluster}\label{overview-of-machine-configurations-in-each-cluster}} - -\input{figures/machine_configs} - -Refer to figure \ref{fig:machineconfigs}. - -\textbf{Observations}: - -\begin{itemize} -\item - machine configurations are definitely more varied than the ones in the - 2011 traces -\item - some clusters have more machine variability -\end{itemize} - -\hypertarget{analysis-of-execution-time-per-each-execution-phase}{% -\subsection{Analysis of execution time per each execution -phase}\label{analysis-of-execution-time-per-each-execution-phase}} +\section{Analysis: Performance Impact of Unsuccessful Executions} \input{figures/machine_time_waste} -Refer to figures \ref{fig:machinetimewaste-abs} and -\ref{fig:machinetimewaste-rel}. +Our first investigation focuses on replicating the methodologies used in the +2015 DSN Ros\'a et al.\ paper\cite{vino-paper} regarding the usage of machine time +and resources. -\textbf{Observations}: +In this section we perform several analyses focusing on how machine time and +resources are wasted, by means of a temporal vs. spatial resource analysis from +the perspective of single tasks as well as jobs. We then compare the results +from the 2019 traces to the ones that were obtained in 2015 to understand the +workload evolution inside Borg between 2011 and 2019. -\begin{itemize} -\item - Across all cluster almost 50\% of time is spent in ``unknown'' - transitions, i.e. there are some time slices that are related to a - state transition that Google says are not ``typical'' transitions. 
- This is mostly due to the trace log being intermittent when recording - all state transitions. -\item - 80\% of the time spent in KILL and LOST is unknown. This is - predictable, since both states indicate that the job execution is not - stable (in particular LOST is used when the state logging itself is - unstable) -\item - From the absolute graph we see that the time ``wasted'' on non-finish - terminated jobs is very significant -\item - Execution is the most significant task phase, followed by queuing time - and scheduling time (``ready'' state) -\item - In the absolute graph we see that a significant amount of time is - spent to re-schedule evicted jobs (``evicted'' state) -\item - Cluster A has unusually high queuing times -\end{itemize} +\subsection{Temporal Impact: Machine Time Waste} -\hypertarget{task-slowdown}{% -\subsection{Task slowdown}\label{task-slowdown}} +This analysis explores how machine time is distributed over task events and +submissions. By partitioning the collection of all terminating tasks by their +termination event, the analysis aims to measure the total time spent by tasks in +three different execution phases (a short code sketch of this computation is +given below): +\begin{description} +\item[resubmission time:] the total of all time deltas between every task + termination event and the immediately succeeding task submission event, i.e. + the total time spent by tasks waiting to be resubmitted in Borg after a + termination; +\item[queue time:] the total of all time deltas between every task submission + event and the following task scheduling event, i.e. the total time spent by + tasks queuing before execution; +\item[running time:] the total of all time deltas between every task scheduling + event and the following task termination event, i.e. the total time spent by + tasks ``executing'' (i.e. performing useful computations) in the clusters. +\end{description} + +In the 2019 traces, an additional ``Unknown'' measure is counted. This measure +collects all the time intervals in which the recorded event transitions do not +allow us to safely determine which execution phase a task is in. Unknown measures +are mostly caused by faults and missed event writes in the task event log that +was used to generate the traces. + +The analysis results are depicted in figure~\ref{fig:machinetimewaste-rel} as a +comparison between the 2011 and 2019 traces, aggregating the data from all +clusters. Additionally, figure~\ref{fig:machinetimewaste-rel-csts} provides a +cluster-by-cluster breakdown for the 2019 traces. + +The most striking difference between 2011 and 2019 data is in the machine time +distribution per task termination type. In the 2019 traces, 94.38\% of global +machine time is spent on tasks that are eventually \texttt{KILL}ed. +\texttt{FINISH}, \texttt{EVICT} and \texttt{FAIL} tasks respectively register +totals of 4.20\%, 1.18\% and 0.25\% of machine time, maintaining an analogous +distribution among themselves to the one observed in the 2011 traces. + +Considering instead the distribution between execution phase times, the comparison +shows very similar behaviour between the two traces, with ``Running'' time being +dominant (at a total of 16.63\% across task terminations in 2019) over the queue +and resubmission phases (with respective 2019 totals of 3.26\% and 0.004\%). + +However, another noteworthy difference between 2011 and 2019 data lies in the +``Unknown'' measure, present only in the latter traces, which registers a total of +80.12\% of global machine time across all terminations. This data can be interpreted +as a strong indication of the ``poor quality'' of the 2019 traces w.r.t.\ the +accuracy of task event logging.
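+
+As an illustrative sketch of how these per-phase totals can be derived (this is
+not the exact analysis code used for this project, and the file name, the
+\texttt{task\_id}, \texttt{time} and \texttt{type} column names, and the event
+names are assumed purely for illustration), the time deltas between consecutive
+events of each task could be classified and aggregated with pandas as follows:
+
+\begin{verbatim}
+import pandas as pd
+
+# One row per task event; assumed columns: task_id, time (ms) and type
+# (e.g. SUBMIT, SCHEDULE, EVICT, FAIL, FINISH, KILL).
+events = pd.read_csv("task_events.csv").sort_values(["task_id", "time"])
+
+def phase(prev_type, next_type):
+    # Classify the interval between two consecutive events of the same task.
+    if prev_type == "SUBMIT" and next_type == "SCHEDULE":
+        return "Queue"
+    if prev_type == "SCHEDULE":
+        return "Running"
+    if next_type == "SUBMIT":
+        return "Resubmission"
+    return "Unknown"
+
+per_task = events.groupby("task_id")
+next_time = per_task["time"].shift(-1)  # time of the next event of each task
+next_type = per_task["type"].shift(-1)  # type of the next event of each task
+deltas = events.assign(delta=next_time - events["time"], next_type=next_type)
+deltas = deltas.dropna(subset=["delta"])  # drops each task's last event
+deltas["phase"] = [phase(p, n)
+                   for p, n in zip(deltas["type"], deltas["next_type"])]
+print(deltas.groupby("phase")["delta"].sum())  # total time per execution phase
+\end{verbatim}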
+ +Considering instead the behaviour of each single cluster in the 2019 traces, no +significant difference between them can be observed. The only notable difference +lies in the ``Running time''--``Unknown time'' ratio for \texttt{KILL}ed tasks, +which is at its highest in cluster A (30.78\% vs.\ 58.71\% of global machine time) +and at its lowest in cluster H (8.06\% vs.\ 84.77\% of global machine time). + +\subsection{Average Slowdown per Task} \input{figures/task_slowdown} +This analysis aims to measure the ``slowdown'' metric, which is defined as the +ratio between the total response time (i.e.\ queue time plus running time) across +all executions of a given task and the response time of its last execution (a +formal sketch of this metric is given at the end of this subsection). This metric +is especially useful to analyze the impact of unsuccessful executions on each +task's total execution time w.r.t.\ the intrinsic workload (i.e.\ computational +time) of tasks. -\textbf{Observations}: +Refer to figure~\ref{fig:taskslowdown} for a comparison between the 2011 and +2019 mean task slowdown measures broken down by task priority. Additionally, said +means are computed on a cluster-by-cluster basis for 2019 data in +figure~\ref{fig:taskslowdown-csts}. -\begin{itemize} -\item - Priority values are different from 0-11 values in the 2011 traces. A - conversion table is provided by Google; -\item - For some priorities (e.g.~101 for cluster D) the relative number of - finishing task is very low and the mean slowdown is very high (315). - This behaviour differs from the relatively homogeneous values from the - 2011 traces. -\item - Some slowdown values cannot be computed since either some tasks have a - 0ns execution time or for some priorities no tasks in the traces - terminate successfully. More raw data on those exception is in - Jupyter. -\item +In 2015 Ros\'a et al.\cite{vino-paper} measured mean task slowdown for each task +priority value, which at the time were numeric values in the range $[0,11]$. +However, in the 2019 traces, task priorities are given as numeric values in the +range $[0,500]$. Therefore, to allow for an easier comparison, mean task slowdown +values are computed by task priority tier over the 2019 data. Priority tiers are +semantically relevant priority ranges defined in the 2020 Tirmazi et al.\ +paper\cite{google-marso-19} that introduced the 2019 traces. Equivalent priority +tiers are also provided next to the 2011 priority values in the table covering +the 2015 analysis. + +In the given tables, the \textbf{\% finished} column corresponds to the +percentage of \texttt{FINISH}ed tasks for that priority or tier. \textbf{Mean +response [s] (last execution)} instead shows the mean response time of the last +task execution of each task in that priority/tier. +\textbf{Mean response [s] (all executions)} provides a very similar figure, +though this column shows the mean response time across all executions. +\textbf{Mean slowdown} instead provides the mean slowdown value for each task +priority/tier. + +Comparing the tables in figure~\ref{fig:taskslowdown} we observe that the +maximum mean slowdown measure for 2019 data (i.e.\ 7.84, for the BEB tier) is +almost double the maximum measure in 2011 data (i.e.\ 3.39, for priority $3$, +corresponding to the BEB tier). The ``Best effort batch'' tier, as the name +suggests, is a lower priority tier where failures are more tolerated. Therefore, +given the increased concurrency in the 2019 clusters compared to 2011 and the +higher machine time spent on unsuccessful executions (as observed in the previous +analysis), an increased slowdown rate for this class is not particularly +surprising.
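+
+As a formal sketch of the slowdown metric discussed above (with notation assumed
+here for illustration and not taken from the cited papers): if a task $t$ has
+$N_t$ executions with response times $r_1, \dots, r_{N_t}$, where $r_{N_t}$ is
+the response time of its last execution, then
+\[
+  \mathrm{slowdown}(t) \;=\; \frac{\sum_{i=1}^{N_t} r_i}{r_{N_t}} \;\geq\; 1,
+\]
+and the values reported in the tables are the means of $\mathrm{slowdown}(t)$
+over the tasks belonging to a given priority (2011) or priority tier (2019).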
+ +\textbf{TBD} The \% of finishing jobs is relatively low comparing with the 2011 traces. -\end{itemize} + +\input{figures/spatial_resource_waste} +\input{figures/table_iii} % has table III and table IV in it +\input{figures/figure_5} \hypertarget{reserved-and-actual-resource-usage-of-tasks}{% \subsection{Reserved and actual resource usage of tasks}\label{reserved-and-actual-resource-usage-of-tasks}} -\input{figures/spatial_resource_waste} Refer to figures \ref{fig:spatialresourcewaste-actual} and \ref{fig:spatialresourcewaste-requested}. @@ -553,6 +593,63 @@ Refer to figures \ref{fig:spatialresourcewaste-actual} and both CPU and RAM \end{itemize} + +Refer to figure \ref{fig:tableIII}. + +\textbf{Observations}: + +\begin{itemize} +\item + The mean number of events per task is an order of magnitude higher + than in the 2011 traces +\item + Generally speaking, the event type with higher mean is the termination + event for the task +\item + The \# evts mean is higher than the sum of all other event type means, + since it appears there are a lot more non-termination events in the + 2019 traces. +\end{itemize} + +\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{% +\subsection{Mean number of tasks and event distribution per job +type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}} + + +\textbf{Observations}: + +\begin{itemize} +\item + Again the mean number of tasks is significantly higher than the 2011 + traces, indicating a higher complexity of workloads +\item + Cluster A has no evicted jobs +\item + The number of events is however lower than the event means in the 2011 + traces +\end{itemize} + +\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{% +\subsection{Probability of task successful termination given its +unsuccesful +events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}} + + +Refer to figure \ref{fig:figureV}. + +\textbf{Observations}: + +\begin{itemize} +\item + Behaviour is very different from cluster to cluster +\item + There is no easy conclusion, unlike in 2011, on the correlation + between succesful probability and \# of events of a specific type. +\item + Clusters B, C and D in particular have very unsmooth lines that vary a + lot for small \# evts differences. This may be due to an uneven + distribution of \# evts in the traces. +\end{itemize} \hypertarget{correlation-between-task-events-metadata-and-task-termination}{% \subsection{Correlation between task events' metadata and task termination}\label{correlation-between-task-events-metadata-and-task-termination}} @@ -620,66 +717,6 @@ Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and \subsection{Mean number of tasks and event distribution per task type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}} -\input{figures/table_iii} - -Refer to figure \ref{fig:tableIII}. 
- -\textbf{Observations}: - -\begin{itemize} -\item - The mean number of events per task is an order of magnitude higher - than in the 2011 traces -\item - Generally speaking, the event type with higher mean is the termination - event for the task -\item - The \# evts mean is higher than the sum of all other event type means, - since it appears there are a lot more non-termination events in the - 2019 traces. -\end{itemize} - -\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{% -\subsection{Mean number of tasks and event distribution per job -type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}} - -Refer to figure \ref{fig:tableIV}. - -\textbf{Observations}: - -\begin{itemize} -\item - Again the mean number of tasks is significantly higher than the 2011 - traces, indicating a higher complexity of workloads -\item - Cluster A has no evicted jobs -\item - The number of events is however lower than the event means in the 2011 - traces -\end{itemize} - -\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{% -\subsection{Probability of task successful termination given its -unsuccesful -events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}} - -\input{figures/figure_5} - -Refer to figure \ref{fig:figureV}. - -\textbf{Observations}: - -\begin{itemize} -\item - Behaviour is very different from cluster to cluster -\item - There is no easy conclusion, unlike in 2011, on the correlation - between succesful probability and \# of events of a specific type. -\item - Clusters B, C and D in particular have very unsmooth lines that vary a - lot for small \# evts differences. This may be due to an uneven - distribution of \# evts in the traces. -\end{itemize} \hypertarget{potential-causes-of-unsuccesful-executions}{% \subsection{Potential causes of unsuccesful diff --git a/report/figures/spatial_resource_waste.tex b/report/figures/spatial_resource_waste.tex index f02ad376..4d514ef1 100644 --- a/report/figures/spatial_resource_waste.tex +++ b/report/figures/spatial_resource_waste.tex @@ -6,13 +6,13 @@ %\hfill \end{subfigure}} -\begin{figure} +\begin{figure}[p] \spatialresourcewaste[0.5\textwidth]{used-2011} \spatialresourcewaste[0.5\textwidth]{used-all} \caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type in 2011 and 2019 traces (total of clusters A to D). The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-requested} \end{figure} -\begin{figure} +\begin{figure}[p] \spatialresourcewaste{used-a} \spatialresourcewaste{used-b} \spatialresourcewaste{used-c} @@ -20,13 +20,13 @@ \caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type for clusters A to D in 2019 traces. Refer to figure~\ref{fig:spatialresourcewaste-requested} for plot explaination.}\label{fig:spatialresourcewaste-actual-csts} \end{figure} -\begin{figure} +\begin{figure}[p] \spatialresourcewaste[0.5\textwidth]{requested-2011} \spatialresourcewaste[0.5\textwidth]{requested-all} \caption{Percentages of CPU and RAM resources requested by tasks w.r.t. task termination type in 2011 and 2019 traces. The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. 
Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-actual} \end{figure} -\begin{figure} +\begin{figure}[p] \spatialresourcewaste{requested-a} \spatialresourcewaste{requested-b} \spatialresourcewaste{requested-c}