report work

This commit is contained in:
Claudio Maggioni 2021-05-26 21:46:30 +02:00
parent 0bd6c475ec
commit e200cea3ab
5 changed files with 390 additions and 308 deletions

View file

@ -1 +0,0 @@
,maggicl,Apple2gs.local,24.05.2021 14:04,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;

View file

@ -11,7 +11,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 31,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -27,12 +27,13 @@
" 'text.usetex': True,\n", " 'text.usetex': True,\n",
" 'pgf.rcfonts': False,\n", " 'pgf.rcfonts': False,\n",
"})\n", "})\n",
"import matplotlib.pyplot as plt" "import matplotlib.pyplot as plt\n",
"pandas.options.display.float_format = '{:,.3f}'.format"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 32,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -53,16 +54,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 33,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"DIR = \"/home/claudio/hdd/git/bachelorThesis\"" "DIR = \"/Users/maggicl/git/bachelorThesis\""
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -138,7 +139,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 35,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -163,7 +164,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 36,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -174,6 +175,9 @@
"\n", "\n",
" df.rename(columns = {'time_type': 'Execution phase'}, inplace = True)\n", " df.rename(columns = {'time_type': 'Execution phase'}, inplace = True)\n",
" \n", " \n",
" print(\"Cluster \"+cluster)\n",
" print(df)\n",
" \n",
" h = sns.histplot(df, x=\"Last termination\", \n", " h = sns.histplot(df, x=\"Last termination\", \n",
" weights=\"time_ms\", shrink=.5, common_bins=True,\n", " weights=\"time_ms\", shrink=.5, common_bins=True,\n",
" hue=\"Execution phase\", multiple=\"stack\", discrete=True, legend=True)\n", " hue=\"Execution phase\", multiple=\"stack\", discrete=True, legend=True)\n",
@ -191,149 +195,193 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 37,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" Last termination time_type time_ms\n", "Cluster a\n",
"0 EVICT Queue 1.049774e+12\n", " Last termination Execution phase time_ms\n",
"1 EVICT Resubmission 5.530617e+08\n", "0 EVICT Queue 0.051\n",
"2 EVICT Running 3.218063e+13\n", "1 EVICT Resubmission 0.000\n",
"3 EVICT Unknown 3.383291e+12\n", "2 EVICT Running 1.564\n",
"4 FAIL Queue 9.483261e+11\n", "3 EVICT Unknown 0.164\n",
"5 FAIL Resubmission 7.150500e+01\n", "4 FAIL Queue 0.046\n",
"6 FAIL Running 7.265195e+12\n", "5 FAIL Resubmission 0.000\n",
"7 FAIL Unknown 2.799674e+12\n", "6 FAIL Running 0.353\n",
"8 FINISH Queue 3.317009e+13\n", "7 FAIL Unknown 0.136\n",
"9 FINISH Resubmission 1.828825e+07\n", "8 FINISH Queue 1.612\n",
"10 FINISH Running 3.788436e+13\n", "9 FINISH Resubmission 0.000\n",
"11 FINISH Unknown 2.482661e+13\n", "10 FINISH Running 1.841\n",
"12 KILL Queue 7.482888e+13\n", "11 FINISH Unknown 1.207\n",
"13 KILL Resubmission 1.211419e+11\n", "12 KILL Queue 3.637\n",
"14 KILL Running 6.311166e+14\n", "13 KILL Resubmission 0.006\n",
"15 KILL Unknown 1.207792e+15\n", "14 KILL Running 30.676\n",
" Last termination time_type time_ms\n", "15 KILL Unknown 58.706\n",
"0 EVICT Queue 2.991028e+11\n", "Cluster b\n",
"1 EVICT Resubmission 1.360657e+09\n", " Last termination Execution phase time_ms\n",
"2 EVICT Running 2.871365e+13\n", "0 EVICT Queue 0.010\n",
"3 EVICT Unknown 1.428912e+13\n", "1 EVICT Resubmission 0.000\n",
"4 FAIL Queue 9.376134e+10\n", "2 EVICT Running 0.971\n",
"5 FAIL Resubmission 1.225520e+02\n", "3 EVICT Unknown 0.483\n",
"6 FAIL Running 8.338530e+12\n", "4 FAIL Queue 0.003\n",
"7 FAIL Unknown 1.989378e+12\n", "5 FAIL Resubmission 0.000\n",
"8 FINISH Queue 6.817208e+12\n", "6 FAIL Running 0.282\n",
"9 FINISH Resubmission 1.493729e+03\n", "7 FAIL Unknown 0.067\n",
"10 FINISH Running 8.069421e+13\n", "8 FINISH Queue 0.231\n",
"11 FINISH Unknown 1.006353e+14\n", "9 FINISH Resubmission 0.000\n",
"12 KILL Queue 5.397953e+13\n", "10 FINISH Running 2.729\n",
"13 KILL Resubmission 1.842002e+10\n", "11 FINISH Unknown 3.404\n",
"14 KILL Running 5.716892e+14\n", "12 KILL Queue 1.826\n",
"15 KILL Unknown 2.088855e+15\n", "13 KILL Resubmission 0.001\n",
" Last termination time_type time_ms\n", "14 KILL Running 19.337\n",
"0 EVICT Queue 3.158380e+11\n", "15 KILL Unknown 70.655\n",
"1 EVICT Resubmission 2.355575e+09\n", "Cluster c\n",
"2 EVICT Running 4.229815e+13\n", " Last termination Execution phase time_ms\n",
"3 EVICT Unknown 6.785277e+12\n", "0 EVICT Queue 0.010\n",
"4 FAIL Queue 2.352869e+11\n", "1 EVICT Resubmission 0.000\n",
"5 FAIL Resubmission 4.684500e+01\n", "2 EVICT Running 1.401\n",
"6 FAIL Running 9.316941e+12\n", "3 EVICT Unknown 0.225\n",
"7 FAIL Unknown 4.873943e+12\n", "4 FAIL Queue 0.008\n",
"8 FINISH Queue 1.172189e+13\n", "5 FAIL Resubmission 0.000\n",
"9 FINISH Resubmission 3.623451e+03\n", "6 FAIL Running 0.309\n",
"10 FINISH Running 1.154498e+14\n", "7 FAIL Unknown 0.161\n",
"11 FINISH Unknown 4.934279e+13\n", "8 FINISH Queue 0.388\n",
"12 KILL Queue 7.171264e+13\n", "9 FINISH Resubmission 0.000\n",
"13 KILL Resubmission 2.108520e+11\n", "10 FINISH Running 3.824\n",
"14 KILL Running 6.180005e+14\n", "11 FINISH Unknown 1.635\n",
"15 KILL Unknown 2.088457e+15\n", "12 KILL Queue 2.376\n",
" Last termination time_type time_ms\n", "13 KILL Resubmission 0.007\n",
"0 EVICT Queue 1.415993e+11\n", "14 KILL Running 20.472\n",
"1 EVICT Resubmission 2.835890e+08\n", "15 KILL Unknown 69.183\n",
"2 EVICT Running 4.303187e+13\n", "Cluster d\n",
"3 EVICT Unknown 7.410999e+12\n", " Last termination Execution phase time_ms\n",
"4 FAIL Queue 2.231462e+10\n", "0 EVICT Queue 0.005\n",
"5 FAIL Resubmission 1.073960e+02\n", "1 EVICT Resubmission 0.000\n",
"6 FAIL Running 1.186956e+13\n", "2 EVICT Running 1.542\n",
"7 FAIL Unknown 2.829927e+12\n", "3 EVICT Unknown 0.265\n",
"8 FINISH Queue 4.455665e+12\n", "4 FAIL Queue 0.001\n",
"9 FINISH Resubmission 1.577302e+03\n", "5 FAIL Resubmission 0.000\n",
"10 FINISH Running 6.516562e+13\n", "6 FAIL Running 0.425\n",
"11 FINISH Unknown 7.106965e+13\n", "7 FAIL Unknown 0.101\n",
"12 KILL Queue 7.435926e+13\n", "8 FINISH Queue 0.160\n",
"13 KILL Resubmission 5.556059e+11\n", "9 FINISH Resubmission 0.000\n",
"14 KILL Running 4.702722e+14\n", "10 FINISH Running 2.334\n",
"15 KILL Unknown 2.040366e+15\n", "11 FINISH Unknown 2.546\n",
" Last termination time_type time_ms\n", "12 KILL Queue 2.664\n",
"0 EVICT Queue 1.722618e+10\n", "13 KILL Resubmission 0.020\n",
"1 EVICT Resubmission 1.788932e+09\n", "14 KILL Running 16.846\n",
"2 EVICT Running 1.710804e+13\n", "15 KILL Unknown 73.091\n",
"3 EVICT Unknown 7.078678e+12\n", "Cluster e\n",
"4 FAIL Queue 2.895755e+09\n", " Last termination Execution phase time_ms\n",
"5 FAIL Resubmission 5.304400e+01\n", "0 EVICT Queue 0.000\n",
"6 FAIL Running 2.281806e+12\n", "1 EVICT Resubmission 0.000\n",
"7 FAIL Unknown 3.984907e+11\n", "2 EVICT Running 0.472\n",
"8 FINISH Queue 7.454410e+11\n", "3 EVICT Unknown 0.195\n",
"9 FINISH Resubmission 6.310360e+02\n", "4 FAIL Queue 0.000\n",
"10 FINISH Running 4.284518e+13\n", "5 FAIL Resubmission 0.000\n",
"11 FINISH Unknown 3.672368e+13\n", "6 FAIL Running 0.063\n",
"12 KILL Queue 1.398332e+14\n", "7 FAIL Unknown 0.011\n",
"13 KILL Resubmission 4.825723e+10\n", "8 FINISH Queue 0.021\n",
"14 KILL Running 3.049664e+14\n", "9 FINISH Resubmission 0.000\n",
"15 KILL Unknown 3.072445e+15\n", "10 FINISH Running 1.182\n",
" Last termination time_type time_ms\n", "11 FINISH Unknown 1.013\n",
"0 EVICT Queue 3.140594e+10\n", "12 KILL Queue 3.858\n",
"1 EVICT Resubmission 1.504263e+10\n", "13 KILL Resubmission 0.001\n",
"2 EVICT Running 5.070239e+13\n", "14 KILL Running 8.414\n",
"3 EVICT Unknown 1.602834e+13\n", "15 KILL Unknown 84.769\n",
"4 FAIL Queue 5.523972e+09\n", "Cluster f\n",
"5 FAIL Resubmission 2.352700e+01\n", " Last termination Execution phase time_ms\n",
"6 FAIL Running 3.889624e+12\n", "0 EVICT Queue 0.001\n",
"7 FAIL Unknown 1.833895e+12\n", "1 EVICT Resubmission 0.000\n",
"8 FINISH Queue 1.098116e+13\n", "2 EVICT Running 1.444\n",
"9 FINISH Resubmission 6.319590e+02\n", "3 EVICT Unknown 0.457\n",
"10 FINISH Running 9.761364e+13\n", "4 FAIL Queue 0.000\n",
"11 FINISH Unknown 9.603417e+13\n", "5 FAIL Resubmission 0.000\n",
"12 KILL Queue 1.129539e+14\n", "6 FAIL Running 0.111\n",
"13 KILL Resubmission 1.356476e+11\n", "7 FAIL Unknown 0.052\n",
"14 KILL Running 4.505937e+14\n", "8 FINISH Queue 0.313\n",
"15 KILL Unknown 2.669451e+15\n", "9 FINISH Resubmission 0.000\n",
" Last termination time_type time_ms\n", "10 FINISH Running 2.781\n",
"0 EVICT Queue 9.528645e+10\n", "11 FINISH Unknown 2.736\n",
"1 EVICT Resubmission 1.493116e+09\n", "12 KILL Queue 3.218\n",
"2 EVICT Running 8.513084e+12\n", "13 KILL Resubmission 0.004\n",
"3 EVICT Unknown 2.778074e+12\n", "14 KILL Running 12.836\n",
"4 FAIL Queue 2.887122e+11\n", "15 KILL Unknown 76.047\n",
"5 FAIL Resubmission 1.757300e+01\n", "Cluster g\n",
"6 FAIL Running 1.867799e+12\n", " Last termination Execution phase time_ms\n",
"7 FAIL Unknown 6.622832e+11\n", "0 EVICT Queue 0.002\n",
"8 FINISH Queue 8.337090e+11\n", "1 EVICT Resubmission 0.000\n",
"9 FINISH Resubmission 6.753141e+07\n", "2 EVICT Running 0.196\n",
"10 FINISH Running 3.514254e+13\n", "3 EVICT Unknown 0.064\n",
"11 FINISH Unknown 6.704536e+13\n", "4 FAIL Queue 0.007\n",
"12 KILL Queue 1.152843e+14\n", "5 FAIL Resubmission 0.000\n",
"13 KILL Resubmission 5.814544e+10\n", "6 FAIL Running 0.043\n",
"14 KILL Running 2.225128e+14\n", "7 FAIL Unknown 0.015\n",
"15 KILL Unknown 3.894626e+15\n", "8 FINISH Queue 0.019\n",
" Last termination time_type time_ms\n", "9 FINISH Resubmission 0.000\n",
"0 EVICT Queue 4.621613e+10\n", "10 FINISH Running 0.808\n",
"1 EVICT Resubmission 4.511340e+02\n", "11 FINISH Unknown 1.541\n",
"2 EVICT Running 2.786346e+13\n", "12 KILL Queue 2.650\n",
"3 EVICT Unknown 9.513981e+12\n", "13 KILL Resubmission 0.001\n",
"4 FAIL Queue 7.828423e+09\n", "14 KILL Running 5.116\n",
"5 FAIL Resubmission 1.148130e+02\n", "15 KILL Unknown 89.538\n",
"6 FAIL Running 3.509052e+12\n", "Cluster h\n",
"7 FAIL Unknown 1.212378e+12\n", " Last termination Execution phase time_ms\n",
"8 FINISH Queue 9.252380e+12\n", "0 EVICT Queue 0.001\n",
"9 FINISH Resubmission 1.675400e+02\n", "1 EVICT Resubmission 0.000\n",
"10 FINISH Running 7.635478e+13\n", "2 EVICT Running 0.585\n",
"11 FINISH Unknown 5.980213e+13\n", "3 EVICT Unknown 0.200\n",
"12 KILL Queue 1.543895e+14\n", "4 FAIL Queue 0.000\n",
"13 KILL Resubmission 3.419664e+09\n", "5 FAIL Resubmission 0.000\n",
"14 KILL Running 3.838571e+14\n", "6 FAIL Running 0.074\n",
"15 KILL Unknown 4.039843e+15\n" "7 FAIL Unknown 0.025\n",
"8 FINISH Queue 0.194\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 1.602\n",
"11 FINISH Unknown 1.255\n",
"12 KILL Queue 3.240\n",
"13 KILL Resubmission 0.000\n",
"14 KILL Running 8.055\n",
"15 KILL Unknown 84.770\n",
"Cluster all\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.007\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.925\n",
"3 EVICT Unknown 0.248\n",
"4 FAIL Queue 0.006\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.179\n",
"7 FAIL Unknown 0.061\n",
"8 FINISH Queue 0.288\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 2.036\n",
"11 FINISH Unknown 1.867\n",
"12 KILL Queue 2.945\n",
"13 KILL Resubmission 0.004\n",
"14 KILL Running 13.493\n",
"15 KILL Unknown 77.941\n",
"Cluster 2011\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 2.500\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 17.500\n",
"3 EVICT Unknown 0.000\n",
"4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 5.000\n",
"7 FAIL Unknown 0.000\n",
"8 FINISH Queue 1.000\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 39.000\n",
"11 FINISH Unknown 0.000\n",
"12 KILL Queue 5.000\n",
"13 KILL Resubmission 1.000\n",
"14 KILL Running 30.000\n",
"15 KILL Unknown 0.000\n"
] ]
} }
], ],
@ -343,8 +391,6 @@
"\n", "\n",
"for cluster in \"abcdefgh\":\n", "for cluster in \"abcdefgh\":\n",
" df, totals = create_df(cluster)\n", " df, totals = create_df(cluster)\n",
" \n",
" print(df)\n",
"\n", "\n",
" #plt.figure(figsize=(10,8))\n", " #plt.figure(figsize=(10,8))\n",
" #graph_1(df, cluster)\n", " #graph_1(df, cluster)\n",
@ -377,7 +423,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 38,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -385,22 +431,22 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" Last termination time_type time_ms\n", " Last termination time_type time_ms\n",
"0 EVICT Queue 7.373993e-05\n", "0 EVICT Queue 0.000\n",
"1 EVICT Resubmission 8.449953e-07\n", "1 EVICT Resubmission 0.000\n",
"2 EVICT Running 9.249078e-03\n", "2 EVICT Running 0.009\n",
"3 EVICT Unknown 2.484572e-03\n", "3 EVICT Unknown 0.002\n",
"4 FAIL Queue 5.926861e-05\n", "4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 2.058252e-14\n", "5 FAIL Resubmission 0.000\n",
"6 FAIL Running 1.785410e-03\n", "6 FAIL Running 0.002\n",
"7 FAIL Unknown 6.131290e-04\n", "7 FAIL Unknown 0.001\n",
"8 FINISH Queue 2.880144e-03\n", "8 FINISH Queue 0.003\n",
"9 FINISH Resubmission 3.170097e-09\n", "9 FINISH Resubmission 0.000\n",
"10 FINISH Running 2.035703e-02\n", "10 FINISH Running 0.020\n",
"11 FINISH Unknown 1.867017e-02\n", "11 FINISH Unknown 0.019\n",
"12 KILL Queue 2.945024e-02\n", "12 KILL Queue 0.029\n",
"13 KILL Resubmission 4.253091e-05\n", "13 KILL Resubmission 0.000\n",
"14 KILL Running 1.349259e-01\n", "14 KILL Running 0.135\n",
"15 KILL Unknown 7.794080e-01\n" "15 KILL Unknown 0.779\n"
] ]
} }
], ],

Binary file not shown.

View file

@ -82,6 +82,15 @@ and stored in JSONL format)\cite{google-drive-marso}, requiring a considerable
amount of computational power to analyze them and the implementation of special amount of computational power to analyze them and the implementation of special
data engineering techniques for analysis of the data. data engineering techniques for analysis of the data.
\input{figures/machine_configs}
An overview of the machine configurations in the cluster analyzed with the 2011
traces and in the 8 clusters composing the 2019 traces can be found in
figure~\ref{fig:machineconfigs}. Additionally, in
figure~\ref{fig:machineconfigs-csts}, the same machine configuration data is
provided for the 2019 traces as a cluster-by-cluster distribution of the
machines.
This project aims to repeat the analysis performed in 2015 to highlight This project aims to repeat the analysis performed in 2015 to highlight
similarities and differences in workload this decade brought, and expanding the similarities and differences in workload this decade brought, and expanding the
old analysis to understand even better the causes of failures and how to prevent old analysis to understand even better the causes of failures and how to prevent
@ -441,102 +450,133 @@ deltas. Finally, the mean of the computed slowdown values is computed resulting
in the clear and concise tables found in figure~\ref{fig:taskslowdown}. in the clear and concise tables found in figure~\ref{fig:taskslowdown}.
\hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{%
\subsection{Ad-Hoc presentation of some analysis
scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}}
\textbf{TBD} (with diagrams)
\hypertarget{analysis-and-observations}{%
\section{Analysis and observations}\label{analysis-and-observations}}
\hypertarget{overview-of-machine-configurations-in-each-cluster}{%
\subsection{Overview of machine configurations in each
cluster}\label{overview-of-machine-configurations-in-each-cluster}}
\input{figures/machine_configs}
Refer to figure \ref{fig:machineconfigs}.
\textbf{Observations}:
\begin{itemize}
\item
machine configurations are definitely more varied than the ones in the
2011 traces
\item
some clusters have more machine variability
\end{itemize}
\hypertarget{analysis-of-execution-time-per-each-execution-phase}{%
\subsection{Analysis of execution time per each execution
phase}\label{analysis-of-execution-time-per-each-execution-phase}}
\section{Analysis: Performance Impact of Unsuccessful Executions}
\input{figures/machine_time_waste} \input{figures/machine_time_waste}
Refer to figures \ref{fig:machinetimewaste-abs} and Our first investigation focuses on replicating the methodologies used in the
\ref{fig:machinetimewaste-rel}. 2015 DSN Ros\'a et al.\ paper\cite{vino-paper} regarding usage of machine time
and resources.
\textbf{Observations}: In this section we perform several analyses focusing on how machine time and
resources are wasted, by means of a temporal vs. spatial resource analysis from
the perspective of single tasks as well as jobs. We then compare the results
from the 2019 traces to the ones that were obtained in 2015 to understand the
workload evolution inside Borg between 2011 and 2019.
\begin{itemize} \subsection{Temporal Impact: Machine Time Waste}
\item
Across all cluster almost 50\% of time is spent in ``unknown''
transitions, i.e. there are some time slices that are related to a
state transition that Google says are not ``typical'' transitions.
This is mostly due to the trace log being intermittent when recording
all state transitions.
\item
80\% of the time spent in KILL and LOST is unknown. This is
predictable, since both states indicate that the job execution is not
stable (in particular LOST is used when the state logging itself is
unstable)
\item
From the absolute graph we see that the time ``wasted'' on non-finish
terminated jobs is very significant
\item
Execution is the most significant task phase, followed by queuing time
and scheduling time (``ready'' state)
\item
In the absolute graph we see that a significant amount of time is
spent to re-schedule evicted jobs (``evicted'' state)
\item
Cluster A has unusually high queuing times
\end{itemize}
\hypertarget{task-slowdown}{% This analysis explores how machine time is distributed over task events and
\subsection{Task slowdown}\label{task-slowdown}} submissions. By partitioning the collection of all terminating tasks by their
termination event, the analysis aims to measure the total time spent by tasks in
3 different execution phases:
\begin{description}
\item[resubmission time:] the total of all time deltas between every task
termination event and the immediately succeeding task submission event, i.e.
the total time spent by tasks waiting to be resubmitted in Borg after a
termination;
\item[queue time:] the total of all time deltas between every task submission
event and the following task scheduling event, i.e. the total time spent by
tasks queuing before execution;
\item[running time:] the total of all time deltas between every task scheduling
event and the following task termination event, i.e. the total time spent by
tasks ``executing'' (i.e. performing useful computations) in the clusters.
\end{description}
In the 2019 traces, an additional ``Unknown'' measure is counted. This measure
collects all the times in which the event transitions between the registered
events do not allow us to safely assume in which execution phase a task may be.
Unknown measures are mostly caused by faults and missed event writes in the task
event log that was used to generate the traces.
The analysis results are depicted in figure~\ref{fig:machinetimewaste-rel} as a
comparison between the 2011 and 2019 traces, aggregating the data from all
clusters. Additionally, in figure~\ref{fig:machinetimewaste-rel-csts}
a cluster-by-cluster breakdown of the results is provided for the 2019 traces.
The striking difference between 2011 and 2019 data is in the machine time
distribution per task termination type. In the 2019 traces, 94.38\% of global
machine time is spent on tasks that are eventually \texttt{KILL}ed.
\texttt{FINISH}, \texttt{EVICT} and \texttt{FAIL} tasks respectively register
totals of 4.20\%, 1.18\% and 0.25\% machine time, maintaining an analogous
distribution between them to their distribution in the 2011 traces.
Considering instead the distribution between execution phase times, the
comparison shows very similar behaviour between the two traces, having the
``Running'' time being dominant (at a total of 16.63\% across task terminations
in 2019) over the queue and resubmission phases (with respective totals in 2019
of 3.26\% and 0.004\%).
However, another noteworthy difference between 2011 and 2019 data lies in the new
``Unknown'' trace dataset present only in the latter traces, registering a total
80.12\% of global machine time across all terminations. This data can be
interpreted as a strong indication of the ``poor quality'' of the 2019 traces
w.r.t.\ the accuracy of task event logging.
Considering instead the behaviour of each single cluster in the 2019 traces, no
significant difference between them can be observed. The only notable difference
lies in the ``Running time''-``Unknown time'' ratio in \texttt{KILL}ed
tasks, which is at its highest in cluster A (at 30.78\% by 58.71\% of global
machine time) and at its lowest in cluster H (at 8.06\% by 84.77\% of global
machine time).
\subsection{Average Slowdown per Task}
\input{figures/task_slowdown} \input{figures/task_slowdown}
Refer to figure \ref{fig:taskslowdown} This analysis aims to measure the figure of ``slowdown'', which is defined as
the ratio between the total response time (i.e.\ queue time and running time)
across all executions of a given task and the response time of the last
execution of said task. This metric is especially useful to analyze the impact
of unsuccessful executions on each task's total execution time w.r.t.\ the intrinsic
workload (i.e.\ computational time) of tasks.
\textbf{Observations}: Refer to figure~\ref{fig:taskslowdown} for a comparison between the 2011 and
2019 mean task slowdown measures broken down by task priority. Additionally, said
means are computed on a cluster-by-cluster basis for 2019 data in
figure~\ref{fig:taskslowdown-csts}.
\begin{itemize} In 2015 Ros\'a et al.\cite{vino-paper} measured mean task slowdown per each task
\item priority value, which at the time were $[0,11]$ numeric values. However,
Priority values are different from 0-11 values in the 2011 traces. A in 2019 traces, task priorities are given as a $[0,500]$ numeric value.
conversion table is provided by Google; Therefore, to allow for an easier comparison, mean task slowdown values are
\item computed by task priority tier over the 2019 data. Priority tiers are
For some priorities (e.g.~101 for cluster D) the relative number of semantically relevant priority ranges defined in the Tirmazi et al.
finishing task is very low and the mean slowdown is very high (315). 2020\cite{google-marso-19} that introduced the 2019 traces. Equivalent priority
This behaviour differs from the relatively homogeneous values from the tiers are also provided next to the 2011 priority values in the table covering
2011 traces. the 2015 analysis.
\item
Some slowdown values cannot be computed since either some tasks have a In the given tables, the \textbf{\% finished} column corresponds to the
0ns execution time or for some priorities no tasks in the traces percentage of \texttt{FINISH}ed tasks for that priority or tier. \textbf{Mean
terminate successfully. More raw data on those exception is in response [s] (last execution)} instead shows the mean response time of the last
Jupyter. task execution of each task in that priority/tier.
\item \textbf{Mean response [s] (all executions)} provides a very similar figure,
though this column shows the mean response time across all executions.
\textbf{Mean slowdown} instead provides the mean slowdown value for each task
priority/tier.
Comparing the tables in figure~\ref{fig:taskslowdown} we observe that the
maximum mean slowdown measure for 2019 data (i.e.\ 7.84, for the BEB tier) is more
than double the maximum measure in 2011 data (i.e.\ 3.39, for priority $3$
corresponding to the BEB tier). The ``Best effort batch'' tier, as the name
suggests, is a lower priority tier where failures are more tolerated. Therefore,
due to the increased concurrency in the 2019 clusters compared to 2011 and the
higher machine time spent for unsuccessful executions (as observed in the
previous analysis), an increased slowdown rate for this class is not particularly
surprising.
\textbf{TBD}
The \% of finishing jobs is relatively low comparing with the 2011 The \% of finishing jobs is relatively low comparing with the 2011
traces. traces.
\end{itemize}
\input{figures/spatial_resource_waste}
\input{figures/table_iii} % has table III and table IV in it
\input{figures/figure_5}
\hypertarget{reserved-and-actual-resource-usage-of-tasks}{% \hypertarget{reserved-and-actual-resource-usage-of-tasks}{%
\subsection{Reserved and actual resource usage of \subsection{Reserved and actual resource usage of
tasks}\label{reserved-and-actual-resource-usage-of-tasks}} tasks}\label{reserved-and-actual-resource-usage-of-tasks}}
\input{figures/spatial_resource_waste}
Refer to figures \ref{fig:spatialresourcewaste-actual} and Refer to figures \ref{fig:spatialresourcewaste-actual} and
\ref{fig:spatialresourcewaste-requested}. \ref{fig:spatialresourcewaste-requested}.
@ -553,6 +593,63 @@ Refer to figures \ref{fig:spatialresourcewaste-actual} and
both CPU and RAM both CPU and RAM
\end{itemize} \end{itemize}
Refer to figure \ref{fig:tableIII}.
\textbf{Observations}:
\begin{itemize}
\item
The mean number of events per task is an order of magnitude higher
than in the 2011 traces
\item
Generally speaking, the event type with higher mean is the termination
event for the task
\item
The \# evts mean is higher than the sum of all other event type means,
since it appears there are a lot more non-termination events in the
2019 traces.
\end{itemize}
\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
\subsection{Mean number of tasks and event distribution per job
type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
\textbf{Observations}:
\begin{itemize}
\item
Again the mean number of tasks is significantly higher than the 2011
traces, indicating a higher complexity of workloads
\item
Cluster A has no evicted jobs
\item
The number of events is however lower than the event means in the 2011
traces
\end{itemize}
\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
\subsection{Probability of task successful termination given its
unsuccessful
events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
Refer to figure \ref{fig:figureV}.
\textbf{Observations}:
\begin{itemize}
\item
Behaviour is very different from cluster to cluster
\item
There is no easy conclusion, unlike in 2011, on the correlation
between the probability of success and the \# of events of a specific type.
\item
Clusters B, C and D in particular have very unsmooth lines that vary a
lot for small \# evts differences. This may be due to an uneven
distribution of \# evts in the traces.
\end{itemize}
\hypertarget{correlation-between-task-events-metadata-and-task-termination}{% \hypertarget{correlation-between-task-events-metadata-and-task-termination}{%
\subsection{Correlation between task events' metadata and task \subsection{Correlation between task events' metadata and task
termination}\label{correlation-between-task-events-metadata-and-task-termination}} termination}\label{correlation-between-task-events-metadata-and-task-termination}}
@ -620,66 +717,6 @@ Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and
\subsection{Mean number of tasks and event distribution per task \subsection{Mean number of tasks and event distribution per task
type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}} type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}}
\input{figures/table_iii}
Refer to figure \ref{fig:tableIII}.
\textbf{Observations}:
\begin{itemize}
\item
The mean number of events per task is an order of magnitude higher
than in the 2011 traces
\item
Generally speaking, the event type with higher mean is the termination
event for the task
\item
The \# evts mean is higher than the sum of all other event type means,
since it appears there are a lot more non-termination events in the
2019 traces.
\end{itemize}
\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
\subsection{Mean number of tasks and event distribution per job
type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
Refer to figure \ref{fig:tableIV}.
\textbf{Observations}:
\begin{itemize}
\item
Again the mean number of tasks is significantly higher than the 2011
traces, indicating a higher complexity of workloads
\item
Cluster A has no evicted jobs
\item
The number of events is however lower than the event means in the 2011
traces
\end{itemize}
\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
\subsection{Probability of task successful termination given its
unsuccesful
events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
\input{figures/figure_5}
Refer to figure \ref{fig:figureV}.
\textbf{Observations}:
\begin{itemize}
\item
Behaviour is very different from cluster to cluster
\item
There is no easy conclusion, unlike in 2011, on the correlation
between succesful probability and \# of events of a specific type.
\item
Clusters B, C and D in particular have very unsmooth lines that vary a
lot for small \# evts differences. This may be due to an uneven
distribution of \# evts in the traces.
\end{itemize}
\hypertarget{potential-causes-of-unsuccesful-executions}{% \hypertarget{potential-causes-of-unsuccesful-executions}{%
\subsection{Potential causes of unsuccesful \subsection{Potential causes of unsuccesful

View file

@ -6,13 +6,13 @@
%\hfill %\hfill
\end{subfigure}} \end{subfigure}}
\begin{figure} \begin{figure}[p]
\spatialresourcewaste[0.5\textwidth]{used-2011} \spatialresourcewaste[0.5\textwidth]{used-2011}
\spatialresourcewaste[0.5\textwidth]{used-all} \spatialresourcewaste[0.5\textwidth]{used-all}
\caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type in 2011 and 2019 traces (total of clusters A to D). The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-requested} \caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type in 2011 and 2019 traces (total of clusters A to D). The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-requested}
\end{figure} \end{figure}
\begin{figure} \begin{figure}[p]
\spatialresourcewaste{used-a} \spatialresourcewaste{used-a}
\spatialresourcewaste{used-b} \spatialresourcewaste{used-b}
\spatialresourcewaste{used-c} \spatialresourcewaste{used-c}
@ -20,13 +20,13 @@
\caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type for clusters A to D in 2019 traces. Refer to figure~\ref{fig:spatialresourcewaste-requested} for plot explanation.}\label{fig:spatialresourcewaste-actual-csts} \caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type for clusters A to D in 2019 traces. Refer to figure~\ref{fig:spatialresourcewaste-requested} for plot explanation.}\label{fig:spatialresourcewaste-actual-csts}
\end{figure} \end{figure}
\begin{figure} \begin{figure}[p]
\spatialresourcewaste[0.5\textwidth]{requested-2011} \spatialresourcewaste[0.5\textwidth]{requested-2011}
\spatialresourcewaste[0.5\textwidth]{requested-all} \spatialresourcewaste[0.5\textwidth]{requested-all}
\caption{Percentages of CPU and RAM resources requested by tasks w.r.t. task termination type in 2011 and 2019 traces. The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-actual} \caption{Percentages of CPU and RAM resources requested by tasks w.r.t. task termination type in 2011 and 2019 traces. The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-actual}
\end{figure} \end{figure}
\begin{figure} \begin{figure}[p]
\spatialresourcewaste{requested-a} \spatialresourcewaste{requested-a}
\spatialresourcewaste{requested-b} \spatialresourcewaste{requested-b}
\spatialresourcewaste{requested-c} \spatialresourcewaste{requested-c}