report work

This commit is contained in:
Claudio Maggioni 2021-05-26 21:46:30 +02:00
parent 828b05a60a
commit a7e2d987c3
5 changed files with 390 additions and 308 deletions

View File

@ -1 +0,0 @@
,maggicl,Apple2gs.local,24.05.2021 14:04,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;

View File

@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@ -27,12 +27,13 @@
" 'text.usetex': True,\n",
" 'pgf.rcfonts': False,\n",
"})\n",
"import matplotlib.pyplot as plt"
"import matplotlib.pyplot as plt\n",
"pandas.options.display.float_format = '{:,.3f}'.format"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@ -53,16 +54,16 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"DIR = \"/home/claudio/hdd/git/bachelorThesis\""
"DIR = \"/Users/maggicl/git/bachelorThesis\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@ -138,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@ -163,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@ -174,6 +175,9 @@
"\n",
" df.rename(columns = {'time_type': 'Execution phase'}, inplace = True)\n",
" \n",
" print(\"Cluster \"+cluster)\n",
" print(df)\n",
" \n",
" h = sns.histplot(df, x=\"Last termination\", \n",
" weights=\"time_ms\", shrink=.5, common_bins=True,\n",
" hue=\"Execution phase\", multiple=\"stack\", discrete=True, legend=True)\n",
@ -191,149 +195,193 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Last termination time_type time_ms\n",
"0 EVICT Queue 1.049774e+12\n",
"1 EVICT Resubmission 5.530617e+08\n",
"2 EVICT Running 3.218063e+13\n",
"3 EVICT Unknown 3.383291e+12\n",
"4 FAIL Queue 9.483261e+11\n",
"5 FAIL Resubmission 7.150500e+01\n",
"6 FAIL Running 7.265195e+12\n",
"7 FAIL Unknown 2.799674e+12\n",
"8 FINISH Queue 3.317009e+13\n",
"9 FINISH Resubmission 1.828825e+07\n",
"10 FINISH Running 3.788436e+13\n",
"11 FINISH Unknown 2.482661e+13\n",
"12 KILL Queue 7.482888e+13\n",
"13 KILL Resubmission 1.211419e+11\n",
"14 KILL Running 6.311166e+14\n",
"15 KILL Unknown 1.207792e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 2.991028e+11\n",
"1 EVICT Resubmission 1.360657e+09\n",
"2 EVICT Running 2.871365e+13\n",
"3 EVICT Unknown 1.428912e+13\n",
"4 FAIL Queue 9.376134e+10\n",
"5 FAIL Resubmission 1.225520e+02\n",
"6 FAIL Running 8.338530e+12\n",
"7 FAIL Unknown 1.989378e+12\n",
"8 FINISH Queue 6.817208e+12\n",
"9 FINISH Resubmission 1.493729e+03\n",
"10 FINISH Running 8.069421e+13\n",
"11 FINISH Unknown 1.006353e+14\n",
"12 KILL Queue 5.397953e+13\n",
"13 KILL Resubmission 1.842002e+10\n",
"14 KILL Running 5.716892e+14\n",
"15 KILL Unknown 2.088855e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 3.158380e+11\n",
"1 EVICT Resubmission 2.355575e+09\n",
"2 EVICT Running 4.229815e+13\n",
"3 EVICT Unknown 6.785277e+12\n",
"4 FAIL Queue 2.352869e+11\n",
"5 FAIL Resubmission 4.684500e+01\n",
"6 FAIL Running 9.316941e+12\n",
"7 FAIL Unknown 4.873943e+12\n",
"8 FINISH Queue 1.172189e+13\n",
"9 FINISH Resubmission 3.623451e+03\n",
"10 FINISH Running 1.154498e+14\n",
"11 FINISH Unknown 4.934279e+13\n",
"12 KILL Queue 7.171264e+13\n",
"13 KILL Resubmission 2.108520e+11\n",
"14 KILL Running 6.180005e+14\n",
"15 KILL Unknown 2.088457e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 1.415993e+11\n",
"1 EVICT Resubmission 2.835890e+08\n",
"2 EVICT Running 4.303187e+13\n",
"3 EVICT Unknown 7.410999e+12\n",
"4 FAIL Queue 2.231462e+10\n",
"5 FAIL Resubmission 1.073960e+02\n",
"6 FAIL Running 1.186956e+13\n",
"7 FAIL Unknown 2.829927e+12\n",
"8 FINISH Queue 4.455665e+12\n",
"9 FINISH Resubmission 1.577302e+03\n",
"10 FINISH Running 6.516562e+13\n",
"11 FINISH Unknown 7.106965e+13\n",
"12 KILL Queue 7.435926e+13\n",
"13 KILL Resubmission 5.556059e+11\n",
"14 KILL Running 4.702722e+14\n",
"15 KILL Unknown 2.040366e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 1.722618e+10\n",
"1 EVICT Resubmission 1.788932e+09\n",
"2 EVICT Running 1.710804e+13\n",
"3 EVICT Unknown 7.078678e+12\n",
"4 FAIL Queue 2.895755e+09\n",
"5 FAIL Resubmission 5.304400e+01\n",
"6 FAIL Running 2.281806e+12\n",
"7 FAIL Unknown 3.984907e+11\n",
"8 FINISH Queue 7.454410e+11\n",
"9 FINISH Resubmission 6.310360e+02\n",
"10 FINISH Running 4.284518e+13\n",
"11 FINISH Unknown 3.672368e+13\n",
"12 KILL Queue 1.398332e+14\n",
"13 KILL Resubmission 4.825723e+10\n",
"14 KILL Running 3.049664e+14\n",
"15 KILL Unknown 3.072445e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 3.140594e+10\n",
"1 EVICT Resubmission 1.504263e+10\n",
"2 EVICT Running 5.070239e+13\n",
"3 EVICT Unknown 1.602834e+13\n",
"4 FAIL Queue 5.523972e+09\n",
"5 FAIL Resubmission 2.352700e+01\n",
"6 FAIL Running 3.889624e+12\n",
"7 FAIL Unknown 1.833895e+12\n",
"8 FINISH Queue 1.098116e+13\n",
"9 FINISH Resubmission 6.319590e+02\n",
"10 FINISH Running 9.761364e+13\n",
"11 FINISH Unknown 9.603417e+13\n",
"12 KILL Queue 1.129539e+14\n",
"13 KILL Resubmission 1.356476e+11\n",
"14 KILL Running 4.505937e+14\n",
"15 KILL Unknown 2.669451e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 9.528645e+10\n",
"1 EVICT Resubmission 1.493116e+09\n",
"2 EVICT Running 8.513084e+12\n",
"3 EVICT Unknown 2.778074e+12\n",
"4 FAIL Queue 2.887122e+11\n",
"5 FAIL Resubmission 1.757300e+01\n",
"6 FAIL Running 1.867799e+12\n",
"7 FAIL Unknown 6.622832e+11\n",
"8 FINISH Queue 8.337090e+11\n",
"9 FINISH Resubmission 6.753141e+07\n",
"10 FINISH Running 3.514254e+13\n",
"11 FINISH Unknown 6.704536e+13\n",
"12 KILL Queue 1.152843e+14\n",
"13 KILL Resubmission 5.814544e+10\n",
"14 KILL Running 2.225128e+14\n",
"15 KILL Unknown 3.894626e+15\n",
" Last termination time_type time_ms\n",
"0 EVICT Queue 4.621613e+10\n",
"1 EVICT Resubmission 4.511340e+02\n",
"2 EVICT Running 2.786346e+13\n",
"3 EVICT Unknown 9.513981e+12\n",
"4 FAIL Queue 7.828423e+09\n",
"5 FAIL Resubmission 1.148130e+02\n",
"6 FAIL Running 3.509052e+12\n",
"7 FAIL Unknown 1.212378e+12\n",
"8 FINISH Queue 9.252380e+12\n",
"9 FINISH Resubmission 1.675400e+02\n",
"10 FINISH Running 7.635478e+13\n",
"11 FINISH Unknown 5.980213e+13\n",
"12 KILL Queue 1.543895e+14\n",
"13 KILL Resubmission 3.419664e+09\n",
"14 KILL Running 3.838571e+14\n",
"15 KILL Unknown 4.039843e+15\n"
"Cluster a\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.051\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 1.564\n",
"3 EVICT Unknown 0.164\n",
"4 FAIL Queue 0.046\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.353\n",
"7 FAIL Unknown 0.136\n",
"8 FINISH Queue 1.612\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 1.841\n",
"11 FINISH Unknown 1.207\n",
"12 KILL Queue 3.637\n",
"13 KILL Resubmission 0.006\n",
"14 KILL Running 30.676\n",
"15 KILL Unknown 58.706\n",
"Cluster b\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.010\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.971\n",
"3 EVICT Unknown 0.483\n",
"4 FAIL Queue 0.003\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.282\n",
"7 FAIL Unknown 0.067\n",
"8 FINISH Queue 0.231\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 2.729\n",
"11 FINISH Unknown 3.404\n",
"12 KILL Queue 1.826\n",
"13 KILL Resubmission 0.001\n",
"14 KILL Running 19.337\n",
"15 KILL Unknown 70.655\n",
"Cluster c\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.010\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 1.401\n",
"3 EVICT Unknown 0.225\n",
"4 FAIL Queue 0.008\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.309\n",
"7 FAIL Unknown 0.161\n",
"8 FINISH Queue 0.388\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 3.824\n",
"11 FINISH Unknown 1.635\n",
"12 KILL Queue 2.376\n",
"13 KILL Resubmission 0.007\n",
"14 KILL Running 20.472\n",
"15 KILL Unknown 69.183\n",
"Cluster d\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.005\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 1.542\n",
"3 EVICT Unknown 0.265\n",
"4 FAIL Queue 0.001\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.425\n",
"7 FAIL Unknown 0.101\n",
"8 FINISH Queue 0.160\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 2.334\n",
"11 FINISH Unknown 2.546\n",
"12 KILL Queue 2.664\n",
"13 KILL Resubmission 0.020\n",
"14 KILL Running 16.846\n",
"15 KILL Unknown 73.091\n",
"Cluster e\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.000\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.472\n",
"3 EVICT Unknown 0.195\n",
"4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.063\n",
"7 FAIL Unknown 0.011\n",
"8 FINISH Queue 0.021\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 1.182\n",
"11 FINISH Unknown 1.013\n",
"12 KILL Queue 3.858\n",
"13 KILL Resubmission 0.001\n",
"14 KILL Running 8.414\n",
"15 KILL Unknown 84.769\n",
"Cluster f\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.001\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 1.444\n",
"3 EVICT Unknown 0.457\n",
"4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.111\n",
"7 FAIL Unknown 0.052\n",
"8 FINISH Queue 0.313\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 2.781\n",
"11 FINISH Unknown 2.736\n",
"12 KILL Queue 3.218\n",
"13 KILL Resubmission 0.004\n",
"14 KILL Running 12.836\n",
"15 KILL Unknown 76.047\n",
"Cluster g\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.002\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.196\n",
"3 EVICT Unknown 0.064\n",
"4 FAIL Queue 0.007\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.043\n",
"7 FAIL Unknown 0.015\n",
"8 FINISH Queue 0.019\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 0.808\n",
"11 FINISH Unknown 1.541\n",
"12 KILL Queue 2.650\n",
"13 KILL Resubmission 0.001\n",
"14 KILL Running 5.116\n",
"15 KILL Unknown 89.538\n",
"Cluster h\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.001\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.585\n",
"3 EVICT Unknown 0.200\n",
"4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.074\n",
"7 FAIL Unknown 0.025\n",
"8 FINISH Queue 0.194\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 1.602\n",
"11 FINISH Unknown 1.255\n",
"12 KILL Queue 3.240\n",
"13 KILL Resubmission 0.000\n",
"14 KILL Running 8.055\n",
"15 KILL Unknown 84.770\n",
"Cluster all\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 0.007\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.925\n",
"3 EVICT Unknown 0.248\n",
"4 FAIL Queue 0.006\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.179\n",
"7 FAIL Unknown 0.061\n",
"8 FINISH Queue 0.288\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 2.036\n",
"11 FINISH Unknown 1.867\n",
"12 KILL Queue 2.945\n",
"13 KILL Resubmission 0.004\n",
"14 KILL Running 13.493\n",
"15 KILL Unknown 77.941\n",
"Cluster 2011\n",
" Last termination Execution phase time_ms\n",
"0 EVICT Queue 2.500\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 17.500\n",
"3 EVICT Unknown 0.000\n",
"4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 5.000\n",
"7 FAIL Unknown 0.000\n",
"8 FINISH Queue 1.000\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 39.000\n",
"11 FINISH Unknown 0.000\n",
"12 KILL Queue 5.000\n",
"13 KILL Resubmission 1.000\n",
"14 KILL Running 30.000\n",
"15 KILL Unknown 0.000\n"
]
}
],
@ -343,8 +391,6 @@
"\n",
"for cluster in \"abcdefgh\":\n",
" df, totals = create_df(cluster)\n",
" \n",
" print(df)\n",
"\n",
" #plt.figure(figsize=(10,8))\n",
" #graph_1(df, cluster)\n",
@ -377,30 +423,30 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Last termination time_type time_ms\n",
"0 EVICT Queue 7.373993e-05\n",
"1 EVICT Resubmission 8.449953e-07\n",
"2 EVICT Running 9.249078e-03\n",
"3 EVICT Unknown 2.484572e-03\n",
"4 FAIL Queue 5.926861e-05\n",
"5 FAIL Resubmission 2.058252e-14\n",
"6 FAIL Running 1.785410e-03\n",
"7 FAIL Unknown 6.131290e-04\n",
"8 FINISH Queue 2.880144e-03\n",
"9 FINISH Resubmission 3.170097e-09\n",
"10 FINISH Running 2.035703e-02\n",
"11 FINISH Unknown 1.867017e-02\n",
"12 KILL Queue 2.945024e-02\n",
"13 KILL Resubmission 4.253091e-05\n",
"14 KILL Running 1.349259e-01\n",
"15 KILL Unknown 7.794080e-01\n"
" Last termination time_type time_ms\n",
"0 EVICT Queue 0.000\n",
"1 EVICT Resubmission 0.000\n",
"2 EVICT Running 0.009\n",
"3 EVICT Unknown 0.002\n",
"4 FAIL Queue 0.000\n",
"5 FAIL Resubmission 0.000\n",
"6 FAIL Running 0.002\n",
"7 FAIL Unknown 0.001\n",
"8 FINISH Queue 0.003\n",
"9 FINISH Resubmission 0.000\n",
"10 FINISH Running 0.020\n",
"11 FINISH Unknown 0.019\n",
"12 KILL Queue 0.029\n",
"13 KILL Resubmission 0.000\n",
"14 KILL Running 0.135\n",
"15 KILL Unknown 0.779\n"
]
}
],

Binary file not shown.

View File

@ -82,6 +82,15 @@ and stored in JSONL format)\cite{google-drive-marso}, requiring a considerable
amount of computational power to analyze them and the implementation of special
data engineering techniques for analysis of the data.
\input{figures/machine_configs}
An overview of the machine configurations in the cluster analyzed with the 2011
traces and in the 8 clusters composing the 2019 traces can be found in
figure~\ref{fig:machineconfigs}. Additionally, in
figure~\ref{fig:machineconfigs-csts}, the same machine configuration data is
broken down for the 2019 traces into a cluster-by-cluster distribution of the
machines.
This project aims to repeat the analysis performed in 2015 to highlight
similarities and differences in workload this decade brought, and to expand the
old analysis to understand even better the causes of failures and how to prevent
@ -441,102 +450,133 @@ deltas. Finally, the mean of the computed slowdown values is computed resulting
in the clear and concise tables found in figure~\ref{fig:taskslowdown}.
\hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{%
\subsection{Ad-Hoc presentation of some analysis
scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}}
\textbf{TBD} (with diagrams)
\hypertarget{analysis-and-observations}{%
\section{Analysis and observations}\label{analysis-and-observations}}
\hypertarget{overview-of-machine-configurations-in-each-cluster}{%
\subsection{Overview of machine configurations in each
cluster}\label{overview-of-machine-configurations-in-each-cluster}}
\input{figures/machine_configs}
Refer to figure \ref{fig:machineconfigs}.
\textbf{Observations}:
\begin{itemize}
\item
machine configurations are definitely more varied than the ones in the
2011 traces
\item
some clusters have more machine variability
\end{itemize}
\hypertarget{analysis-of-execution-time-per-each-execution-phase}{%
\subsection{Analysis of execution time per each execution
phase}\label{analysis-of-execution-time-per-each-execution-phase}}
\section{Analysis: Performance Impact of Unsuccessful Executions}
\input{figures/machine_time_waste}
Refer to figures \ref{fig:machinetimewaste-abs} and
\ref{fig:machinetimewaste-rel}.
Our first investigation focuses on replicating the methodologies used in the
2015 DSN Ros\'a et al.\ paper\cite{vino-paper} regarding usage of machine time
and resources.
\textbf{Observations}:
In this section we perform several analyses focusing on how machine time and
resources are wasted, by means of a temporal vs. spatial resource analysis from
the perspective of single tasks as well as jobs. We then compare the results
from the 2019 traces to the ones that were obtained in 2015 to understand the
workload evolution inside Borg between 2011 and 2019.
\begin{itemize}
\item
Across all clusters almost 50\% of time is spent in ``unknown''
transitions, i.e. there are some time slices that are related to a
state transition that Google says are not ``typical'' transitions.
This is mostly due to the trace log being intermittent when recording
all state transitions.
\item
80\% of the time spent in KILL and LOST is unknown. This is
predictable, since both states indicate that the job execution is not
stable (in particular LOST is used when the state logging itself is
unstable)
\item
From the absolute graph we see that the time ``wasted'' on non-finish
terminated jobs is very significant
\item
Execution is the most significant task phase, followed by queuing time
and scheduling time (``ready'' state)
\item
In the absolute graph we see that a significant amount of time is
spent to re-schedule evicted jobs (``evicted'' state)
\item
Cluster A has unusually high queuing times
\end{itemize}
\subsection{Temporal Impact: Machine Time Waste}
\hypertarget{task-slowdown}{%
\subsection{Task slowdown}\label{task-slowdown}}
This analysis explores how machine time is distributed over task events and
submissions. By partitioning the collection of all terminating tasks by their
termination event, the analysis aims to measure the total time spent by tasks in
3 different execution phases:
\begin{description}
\item[resubmission time:] the total of all time deltas between every task
termination event and the immediately succeeding task submission event,
i.e.\ the total time spent by tasks waiting to be resubmitted in Borg after a
termination;
\item[queue time:] the total of all time deltas between every task submission
event and the following task scheduling event, i.e.\ the total time spent by
tasks queuing before execution;
\item[running time:] the total of all time deltas between every task scheduling
event and the following task termination event, i.e.\ the total time spent by
tasks ``executing'' (i.e.\ performing useful computations) in the clusters.
\end{description}
In the 2019 traces, an additional ``Unknown'' measure is counted. This measure
collects all the times in which the transitions between the registered
events do not allow us to safely assume which execution phase a task is in.
Unknown measures are mostly caused by faults and missed event writes in the task
event log that was used to generate the traces.
The analysis results are depicted in figure~\ref{fig:machinetimewaste-rel} as a
comparison between the 2011 and 2019 traces, aggregating the data from all
clusters. Additionally, in figure~\ref{fig:machinetimewaste-rel-csts} a
cluster-by-cluster breakdown is provided for the 2019 traces.
The striking difference between 2011 and 2019 data is in the machine time
distribution per task termination type. In the 2019 traces, 94.38\% of global
machine time is spent on tasks that are eventually \texttt{KILL}ed.
\texttt{FINISH}, \texttt{EVICT} and \texttt{FAIL} tasks respectively register
totals of 4.20\%, 1.18\% and 0.25\% machine time, maintaining an analogous
distribution between them to their distribution in the 2011 traces.
Considering instead the distribution between execution phase times, the
comparison shows very similar behaviour between the two traces, having the
``Running'' time being dominant (at a total of 16.63\% across task terminations
in 2019) over the queue and resubmission phases (with respective totals in 2019
of 3.26\% and 0.004\%).
However, another noteworthy difference between 2011 and 2019 data lies in the new
``Unknown'' trace dataset present only in the latter traces, registering a total
80.12\% of global machine time across all terminations. This data can be
interpreted as a strong indication of the ``poor quality'' of the 2019 traces
w.r.t.\ the accuracy of task event logging.
Considering instead the behaviour of each single cluster in the 2019 traces, no
significant difference between them can be observed. The only notable difference
lies in the ``Running time''-``Unknown time'' ratio in \texttt{KILL}ed
tasks, which is at its highest in cluster A (at 30.78\% by 58.71\% of global
machine time) and at its lowest in cluster H (at 8.06\% by 84.77\% of global
machine time).
\subsection{Average Slowdown per Task}
\input{figures/task_slowdown}
Refer to figure \ref{fig:taskslowdown}
This analysis aims to measure the figure of ``slowdown'', which is defined as
the ratio between the total response time (i.e.\ queue time and running time)
across all executions of a given task and the response time of the last
execution of said task. This metric is especially useful to analyze the impact
of unsuccessful executions on each task's total execution time w.r.t.\ the intrinsic
workload (i.e.\ computational time) of tasks.
\textbf{Observations}:
Refer to figure~\ref{fig:taskslowdown} for a comparison between the 2011 and
2019 mean task slowdown measures broken down by task priority. Additionally, said
means are computed on a cluster-by-cluster basis for 2019 data in
figure~\ref{fig:taskslowdown-csts}.
\begin{itemize}
\item
Priority values are different from 0-11 values in the 2011 traces. A
conversion table is provided by Google;
\item
For some priorities (e.g.~101 for cluster D) the relative number of
finishing task is very low and the mean slowdown is very high (315).
This behaviour differs from the relatively homogeneous values from the
2011 traces.
\item
Some slowdown values cannot be computed since either some tasks have a
0ns execution time or for some priorities no tasks in the traces
terminate successfully. More raw data on those exception is in
Jupyter.
\item
In 2015 Ros\'a et al.\cite{vino-paper} measured mean task slowdown per each task
priority value, which at the time were $[0,11]$ numeric values. However,
in 2019 traces, task priorities are given as a $[0,500]$ numeric value.
Therefore, to allow for an easier comparison, mean task slowdown values are
computed by task priority tier over the 2019 data. Priority tiers are
semantically relevant priority ranges defined in the Tirmazi et al.\ 2020
paper\cite{google-marso-19} that introduced the 2019 traces. Equivalent priority
tiers are also provided next to the 2011 priority values in the table covering
the 2015 analysis.
In the given tables, the \textbf{\% finished} column corresponds to the
percentage of \texttt{FINISH}ed tasks for that priority or tier. \textbf{Mean
response [s] (last execution)} instead shows the mean response time of the last
task execution of each task in that priority/tier.
\textbf{Mean response [s] (all executions)} provides a very similar figure,
though this column shows the mean response time across all executions.
\textbf{Mean slowdown} instead provides the mean slowdown value for each task
priority/tier.
Comparing the tables in figure~\ref{fig:taskslowdown} we observe that the
maximum mean slowdown measure for 2019 data (i.e.\ 7.84, for the BEB tier) is more
than double the maximum measure in 2011 data (i.e.\ 3.39, for priority $3$
corresponding to the BEB tier). The ``Best effort batch'' tier, as the name
suggests, is a lower priority tier where failures are more tolerated. Therefore,
due to the increased concurrency in the 2019 clusters compared to 2011 and the
higher machine time spent for unsuccessful executions (as observed in the
previous analysis), an increased slowdown rate for this class is not particularly
surprising.
\textbf{TBD}
The \% of finishing jobs is relatively low comparing with the 2011
traces.
\end{itemize}
\input{figures/spatial_resource_waste}
\input{figures/table_iii} % has table III and table IV in it
\input{figures/figure_5}
\hypertarget{reserved-and-actual-resource-usage-of-tasks}{%
\subsection{Reserved and actual resource usage of
tasks}\label{reserved-and-actual-resource-usage-of-tasks}}
\input{figures/spatial_resource_waste}
Refer to figures \ref{fig:spatialresourcewaste-actual} and
\ref{fig:spatialresourcewaste-requested}.
@ -553,6 +593,63 @@ Refer to figures \ref{fig:spatialresourcewaste-actual} and
both CPU and RAM
\end{itemize}
Refer to figure \ref{fig:tableIII}.
\textbf{Observations}:
\begin{itemize}
\item
The mean number of events per task is an order of magnitude higher
than in the 2011 traces
\item
Generally speaking, the event type with higher mean is the termination
event for the task
\item
The \# evts mean is higher than the sum of all other event type means,
since it appears there are a lot more non-termination events in the
2019 traces.
\end{itemize}
\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
\subsection{Mean number of tasks and event distribution per job
type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
\textbf{Observations}:
\begin{itemize}
\item
Again the mean number of tasks is significantly higher than the 2011
traces, indicating a higher complexity of workloads
\item
Cluster A has no evicted jobs
\item
The number of events is however lower than the event means in the 2011
traces
\end{itemize}
\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
\subsection{Probability of task successful termination given its
unsuccessful
events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
Refer to figure \ref{fig:figureV}.
\textbf{Observations}:
\begin{itemize}
\item
Behaviour is very different from cluster to cluster
\item
There is no easy conclusion, unlike in 2011, on the correlation
between success probability and \# of events of a specific type.
\item
Clusters B, C and D in particular have very unsmooth lines that vary a
lot for small \# evts differences. This may be due to an uneven
distribution of \# evts in the traces.
\end{itemize}
\hypertarget{correlation-between-task-events-metadata-and-task-termination}{%
\subsection{Correlation between task events' metadata and task
termination}\label{correlation-between-task-events-metadata-and-task-termination}}
@ -620,66 +717,6 @@ Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and
\subsection{Mean number of tasks and event distribution per task
type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}}
\input{figures/table_iii}
Refer to figure \ref{fig:tableIII}.
\textbf{Observations}:
\begin{itemize}
\item
The mean number of events per task is an order of magnitude higher
than in the 2011 traces
\item
Generally speaking, the event type with higher mean is the termination
event for the task
\item
The \# evts mean is higher than the sum of all other event type means,
since it appears there are a lot more non-termination events in the
2019 traces.
\end{itemize}
\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
\subsection{Mean number of tasks and event distribution per job
type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
Refer to figure \ref{fig:tableIV}.
\textbf{Observations}:
\begin{itemize}
\item
Again the mean number of tasks is significantly higher than the 2011
traces, indicating a higher complexity of workloads
\item
Cluster A has no evicted jobs
\item
The number of events is however lower than the event means in the 2011
traces
\end{itemize}
\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
\subsection{Probability of task successful termination given its
unsuccessful
events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
\input{figures/figure_5}
Refer to figure \ref{fig:figureV}.
\textbf{Observations}:
\begin{itemize}
\item
Behaviour is very different from cluster to cluster
\item
There is no easy conclusion, unlike in 2011, on the correlation
between success probability and \# of events of a specific type.
\item
Clusters B, C and D in particular have very unsmooth lines that vary a
lot for small \# evts differences. This may be due to an uneven
distribution of \# evts in the traces.
\end{itemize}
\hypertarget{potential-causes-of-unsuccesful-executions}{%
\subsection{Potential causes of unsuccesful

View File

@ -6,13 +6,13 @@
%\hfill
\end{subfigure}}
\begin{figure}
\begin{figure}[p]
\spatialresourcewaste[0.5\textwidth]{used-2011}
\spatialresourcewaste[0.5\textwidth]{used-all}
\caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type in 2011 and 2019 traces (total of clusters A to D). The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-requested}
\end{figure}
\begin{figure}
\begin{figure}[p]
\spatialresourcewaste{used-a}
\spatialresourcewaste{used-b}
\spatialresourcewaste{used-c}
@ -20,13 +20,13 @@
\caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type for clusters A to D in 2019 traces. Refer to figure~\ref{fig:spatialresourcewaste-requested} for plot explanation.}\label{fig:spatialresourcewaste-actual-csts}
\end{figure}
\begin{figure}
\begin{figure}[p]
\spatialresourcewaste[0.5\textwidth]{requested-2011}
\spatialresourcewaste[0.5\textwidth]{requested-all}
\caption{Percentages of CPU and RAM resources requested by tasks w.r.t. task termination type in 2011 and 2019 traces. The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-actual}
\end{figure}
\begin{figure}
\begin{figure}[p]
\spatialresourcewaste{requested-a}
\spatialresourcewaste{requested-b}
\spatialresourcewaste{requested-c}