report work

2021-05-26 21:46:30 +02:00 · 2021-05-26 21:46:30 +02:00 · a7e2d987c3
parent 828b05a60a
commit a7e2d987c3
5 changed files with 390 additions and 308 deletions
--- a/.~lock.status.ods#
+++ b/.~lock.status.ods#
@ -1 +0,0 @@
-,maggicl,Apple2gs.local,24.05.2021 14:04,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;
--- a/machine_time_waste/statuses_total_time.ipynb
+++ b/machine_time_waste/statuses_total_time.ipynb
@ -11,7 +11,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
@ -27,12 +27,13 @@
    "    'text.usetex': True,\n",
    "    'pgf.rcfonts': False,\n",
    "})\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt\n",
+    "pandas.options.display.float_format = '{:,.3f}'.format"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
@ -53,16 +54,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
-    "DIR = \"/home/claudio/hdd/git/bachelorThesis\""
+    "DIR = \"/Users/maggicl/git/bachelorThesis\""
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
@ -138,7 +139,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
@ -163,7 +164,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
@ -174,6 +175,9 @@
    "\n",
    "    df.rename(columns = {'time_type': 'Execution phase'}, inplace = True)\n",
    "    \n",
+    "    print(\"Cluster \"+cluster)\n",
+    "    print(df)\n",
+    "    \n",
    "    h = sns.histplot(df, x=\"Last termination\", \n",
    "                     weights=\"time_ms\", shrink=.5, common_bins=True,\n",
    "                     hue=\"Execution phase\", multiple=\"stack\", discrete=True, legend=True)\n",
@ -191,149 +195,193 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  1.049774e+12\n",
-      "1             EVICT  Resubmission  5.530617e+08\n",
-      "2             EVICT       Running  3.218063e+13\n",
-      "3             EVICT       Unknown  3.383291e+12\n",
-      "4              FAIL         Queue  9.483261e+11\n",
-      "5              FAIL  Resubmission  7.150500e+01\n",
-      "6              FAIL       Running  7.265195e+12\n",
-      "7              FAIL       Unknown  2.799674e+12\n",
-      "8            FINISH         Queue  3.317009e+13\n",
-      "9            FINISH  Resubmission  1.828825e+07\n",
-      "10           FINISH       Running  3.788436e+13\n",
-      "11           FINISH       Unknown  2.482661e+13\n",
-      "12             KILL         Queue  7.482888e+13\n",
-      "13             KILL  Resubmission  1.211419e+11\n",
-      "14             KILL       Running  6.311166e+14\n",
-      "15             KILL       Unknown  1.207792e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  2.991028e+11\n",
-      "1             EVICT  Resubmission  1.360657e+09\n",
-      "2             EVICT       Running  2.871365e+13\n",
-      "3             EVICT       Unknown  1.428912e+13\n",
-      "4              FAIL         Queue  9.376134e+10\n",
-      "5              FAIL  Resubmission  1.225520e+02\n",
-      "6              FAIL       Running  8.338530e+12\n",
-      "7              FAIL       Unknown  1.989378e+12\n",
-      "8            FINISH         Queue  6.817208e+12\n",
-      "9            FINISH  Resubmission  1.493729e+03\n",
-      "10           FINISH       Running  8.069421e+13\n",
-      "11           FINISH       Unknown  1.006353e+14\n",
-      "12             KILL         Queue  5.397953e+13\n",
-      "13             KILL  Resubmission  1.842002e+10\n",
-      "14             KILL       Running  5.716892e+14\n",
-      "15             KILL       Unknown  2.088855e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  3.158380e+11\n",
-      "1             EVICT  Resubmission  2.355575e+09\n",
-      "2             EVICT       Running  4.229815e+13\n",
-      "3             EVICT       Unknown  6.785277e+12\n",
-      "4              FAIL         Queue  2.352869e+11\n",
-      "5              FAIL  Resubmission  4.684500e+01\n",
-      "6              FAIL       Running  9.316941e+12\n",
-      "7              FAIL       Unknown  4.873943e+12\n",
-      "8            FINISH         Queue  1.172189e+13\n",
-      "9            FINISH  Resubmission  3.623451e+03\n",
-      "10           FINISH       Running  1.154498e+14\n",
-      "11           FINISH       Unknown  4.934279e+13\n",
-      "12             KILL         Queue  7.171264e+13\n",
-      "13             KILL  Resubmission  2.108520e+11\n",
-      "14             KILL       Running  6.180005e+14\n",
-      "15             KILL       Unknown  2.088457e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  1.415993e+11\n",
-      "1             EVICT  Resubmission  2.835890e+08\n",
-      "2             EVICT       Running  4.303187e+13\n",
-      "3             EVICT       Unknown  7.410999e+12\n",
-      "4              FAIL         Queue  2.231462e+10\n",
-      "5              FAIL  Resubmission  1.073960e+02\n",
-      "6              FAIL       Running  1.186956e+13\n",
-      "7              FAIL       Unknown  2.829927e+12\n",
-      "8            FINISH         Queue  4.455665e+12\n",
-      "9            FINISH  Resubmission  1.577302e+03\n",
-      "10           FINISH       Running  6.516562e+13\n",
-      "11           FINISH       Unknown  7.106965e+13\n",
-      "12             KILL         Queue  7.435926e+13\n",
-      "13             KILL  Resubmission  5.556059e+11\n",
-      "14             KILL       Running  4.702722e+14\n",
-      "15             KILL       Unknown  2.040366e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  1.722618e+10\n",
-      "1             EVICT  Resubmission  1.788932e+09\n",
-      "2             EVICT       Running  1.710804e+13\n",
-      "3             EVICT       Unknown  7.078678e+12\n",
-      "4              FAIL         Queue  2.895755e+09\n",
-      "5              FAIL  Resubmission  5.304400e+01\n",
-      "6              FAIL       Running  2.281806e+12\n",
-      "7              FAIL       Unknown  3.984907e+11\n",
-      "8            FINISH         Queue  7.454410e+11\n",
-      "9            FINISH  Resubmission  6.310360e+02\n",
-      "10           FINISH       Running  4.284518e+13\n",
-      "11           FINISH       Unknown  3.672368e+13\n",
-      "12             KILL         Queue  1.398332e+14\n",
-      "13             KILL  Resubmission  4.825723e+10\n",
-      "14             KILL       Running  3.049664e+14\n",
-      "15             KILL       Unknown  3.072445e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  3.140594e+10\n",
-      "1             EVICT  Resubmission  1.504263e+10\n",
-      "2             EVICT       Running  5.070239e+13\n",
-      "3             EVICT       Unknown  1.602834e+13\n",
-      "4              FAIL         Queue  5.523972e+09\n",
-      "5              FAIL  Resubmission  2.352700e+01\n",
-      "6              FAIL       Running  3.889624e+12\n",
-      "7              FAIL       Unknown  1.833895e+12\n",
-      "8            FINISH         Queue  1.098116e+13\n",
-      "9            FINISH  Resubmission  6.319590e+02\n",
-      "10           FINISH       Running  9.761364e+13\n",
-      "11           FINISH       Unknown  9.603417e+13\n",
-      "12             KILL         Queue  1.129539e+14\n",
-      "13             KILL  Resubmission  1.356476e+11\n",
-      "14             KILL       Running  4.505937e+14\n",
-      "15             KILL       Unknown  2.669451e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  9.528645e+10\n",
-      "1             EVICT  Resubmission  1.493116e+09\n",
-      "2             EVICT       Running  8.513084e+12\n",
-      "3             EVICT       Unknown  2.778074e+12\n",
-      "4              FAIL         Queue  2.887122e+11\n",
-      "5              FAIL  Resubmission  1.757300e+01\n",
-      "6              FAIL       Running  1.867799e+12\n",
-      "7              FAIL       Unknown  6.622832e+11\n",
-      "8            FINISH         Queue  8.337090e+11\n",
-      "9            FINISH  Resubmission  6.753141e+07\n",
-      "10           FINISH       Running  3.514254e+13\n",
-      "11           FINISH       Unknown  6.704536e+13\n",
-      "12             KILL         Queue  1.152843e+14\n",
-      "13             KILL  Resubmission  5.814544e+10\n",
-      "14             KILL       Running  2.225128e+14\n",
-      "15             KILL       Unknown  3.894626e+15\n",
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  4.621613e+10\n",
-      "1             EVICT  Resubmission  4.511340e+02\n",
-      "2             EVICT       Running  2.786346e+13\n",
-      "3             EVICT       Unknown  9.513981e+12\n",
-      "4              FAIL         Queue  7.828423e+09\n",
-      "5              FAIL  Resubmission  1.148130e+02\n",
-      "6              FAIL       Running  3.509052e+12\n",
-      "7              FAIL       Unknown  1.212378e+12\n",
-      "8            FINISH         Queue  9.252380e+12\n",
-      "9            FINISH  Resubmission  1.675400e+02\n",
-      "10           FINISH       Running  7.635478e+13\n",
-      "11           FINISH       Unknown  5.980213e+13\n",
-      "12             KILL         Queue  1.543895e+14\n",
-      "13             KILL  Resubmission  3.419664e+09\n",
-      "14             KILL       Running  3.838571e+14\n",
-      "15             KILL       Unknown  4.039843e+15\n"
+      "Cluster a\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.051\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    1.564\n",
+      "3             EVICT         Unknown    0.164\n",
+      "4              FAIL           Queue    0.046\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.353\n",
+      "7              FAIL         Unknown    0.136\n",
+      "8            FINISH           Queue    1.612\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    1.841\n",
+      "11           FINISH         Unknown    1.207\n",
+      "12             KILL           Queue    3.637\n",
+      "13             KILL    Resubmission    0.006\n",
+      "14             KILL         Running   30.676\n",
+      "15             KILL         Unknown   58.706\n",
+      "Cluster b\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.010\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    0.971\n",
+      "3             EVICT         Unknown    0.483\n",
+      "4              FAIL           Queue    0.003\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.282\n",
+      "7              FAIL         Unknown    0.067\n",
+      "8            FINISH           Queue    0.231\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    2.729\n",
+      "11           FINISH         Unknown    3.404\n",
+      "12             KILL           Queue    1.826\n",
+      "13             KILL    Resubmission    0.001\n",
+      "14             KILL         Running   19.337\n",
+      "15             KILL         Unknown   70.655\n",
+      "Cluster c\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.010\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    1.401\n",
+      "3             EVICT         Unknown    0.225\n",
+      "4              FAIL           Queue    0.008\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.309\n",
+      "7              FAIL         Unknown    0.161\n",
+      "8            FINISH           Queue    0.388\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    3.824\n",
+      "11           FINISH         Unknown    1.635\n",
+      "12             KILL           Queue    2.376\n",
+      "13             KILL    Resubmission    0.007\n",
+      "14             KILL         Running   20.472\n",
+      "15             KILL         Unknown   69.183\n",
+      "Cluster d\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.005\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    1.542\n",
+      "3             EVICT         Unknown    0.265\n",
+      "4              FAIL           Queue    0.001\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.425\n",
+      "7              FAIL         Unknown    0.101\n",
+      "8            FINISH           Queue    0.160\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    2.334\n",
+      "11           FINISH         Unknown    2.546\n",
+      "12             KILL           Queue    2.664\n",
+      "13             KILL    Resubmission    0.020\n",
+      "14             KILL         Running   16.846\n",
+      "15             KILL         Unknown   73.091\n",
+      "Cluster e\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.000\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    0.472\n",
+      "3             EVICT         Unknown    0.195\n",
+      "4              FAIL           Queue    0.000\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.063\n",
+      "7              FAIL         Unknown    0.011\n",
+      "8            FINISH           Queue    0.021\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    1.182\n",
+      "11           FINISH         Unknown    1.013\n",
+      "12             KILL           Queue    3.858\n",
+      "13             KILL    Resubmission    0.001\n",
+      "14             KILL         Running    8.414\n",
+      "15             KILL         Unknown   84.769\n",
+      "Cluster f\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.001\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    1.444\n",
+      "3             EVICT         Unknown    0.457\n",
+      "4              FAIL           Queue    0.000\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.111\n",
+      "7              FAIL         Unknown    0.052\n",
+      "8            FINISH           Queue    0.313\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    2.781\n",
+      "11           FINISH         Unknown    2.736\n",
+      "12             KILL           Queue    3.218\n",
+      "13             KILL    Resubmission    0.004\n",
+      "14             KILL         Running   12.836\n",
+      "15             KILL         Unknown   76.047\n",
+      "Cluster g\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.002\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    0.196\n",
+      "3             EVICT         Unknown    0.064\n",
+      "4              FAIL           Queue    0.007\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.043\n",
+      "7              FAIL         Unknown    0.015\n",
+      "8            FINISH           Queue    0.019\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    0.808\n",
+      "11           FINISH         Unknown    1.541\n",
+      "12             KILL           Queue    2.650\n",
+      "13             KILL    Resubmission    0.001\n",
+      "14             KILL         Running    5.116\n",
+      "15             KILL         Unknown   89.538\n",
+      "Cluster h\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.001\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    0.585\n",
+      "3             EVICT         Unknown    0.200\n",
+      "4              FAIL           Queue    0.000\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.074\n",
+      "7              FAIL         Unknown    0.025\n",
+      "8            FINISH           Queue    0.194\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    1.602\n",
+      "11           FINISH         Unknown    1.255\n",
+      "12             KILL           Queue    3.240\n",
+      "13             KILL    Resubmission    0.000\n",
+      "14             KILL         Running    8.055\n",
+      "15             KILL         Unknown   84.770\n",
+      "Cluster all\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    0.007\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running    0.925\n",
+      "3             EVICT         Unknown    0.248\n",
+      "4              FAIL           Queue    0.006\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    0.179\n",
+      "7              FAIL         Unknown    0.061\n",
+      "8            FINISH           Queue    0.288\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running    2.036\n",
+      "11           FINISH         Unknown    1.867\n",
+      "12             KILL           Queue    2.945\n",
+      "13             KILL    Resubmission    0.004\n",
+      "14             KILL         Running   13.493\n",
+      "15             KILL         Unknown   77.941\n",
+      "Cluster 2011\n",
+      "   Last termination Execution phase  time_ms\n",
+      "0             EVICT           Queue    2.500\n",
+      "1             EVICT    Resubmission    0.000\n",
+      "2             EVICT         Running   17.500\n",
+      "3             EVICT         Unknown    0.000\n",
+      "4              FAIL           Queue    0.000\n",
+      "5              FAIL    Resubmission    0.000\n",
+      "6              FAIL         Running    5.000\n",
+      "7              FAIL         Unknown    0.000\n",
+      "8            FINISH           Queue    1.000\n",
+      "9            FINISH    Resubmission    0.000\n",
+      "10           FINISH         Running   39.000\n",
+      "11           FINISH         Unknown    0.000\n",
+      "12             KILL           Queue    5.000\n",
+      "13             KILL    Resubmission    1.000\n",
+      "14             KILL         Running   30.000\n",
+      "15             KILL         Unknown    0.000\n"
     ]
    }
   ],
@ -343,8 +391,6 @@
    "\n",
    "for cluster in \"abcdefgh\":\n",
    "    df, totals = create_df(cluster)\n",
-    "    \n",
-    "    print(df)\n",
    "\n",
    "    #plt.figure(figsize=(10,8))\n",
    "    #graph_1(df, cluster)\n",
@ -377,30 +423,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "   Last termination     time_type       time_ms\n",
-      "0             EVICT         Queue  7.373993e-05\n",
-      "1             EVICT  Resubmission  8.449953e-07\n",
-      "2             EVICT       Running  9.249078e-03\n",
-      "3             EVICT       Unknown  2.484572e-03\n",
-      "4              FAIL         Queue  5.926861e-05\n",
-      "5              FAIL  Resubmission  2.058252e-14\n",
-      "6              FAIL       Running  1.785410e-03\n",
-      "7              FAIL       Unknown  6.131290e-04\n",
-      "8            FINISH         Queue  2.880144e-03\n",
-      "9            FINISH  Resubmission  3.170097e-09\n",
-      "10           FINISH       Running  2.035703e-02\n",
-      "11           FINISH       Unknown  1.867017e-02\n",
-      "12             KILL         Queue  2.945024e-02\n",
-      "13             KILL  Resubmission  4.253091e-05\n",
-      "14             KILL       Running  1.349259e-01\n",
-      "15             KILL       Unknown  7.794080e-01\n"
+      "   Last termination     time_type  time_ms\n",
+      "0             EVICT         Queue    0.000\n",
+      "1             EVICT  Resubmission    0.000\n",
+      "2             EVICT       Running    0.009\n",
+      "3             EVICT       Unknown    0.002\n",
+      "4              FAIL         Queue    0.000\n",
+      "5              FAIL  Resubmission    0.000\n",
+      "6              FAIL       Running    0.002\n",
+      "7              FAIL       Unknown    0.001\n",
+      "8            FINISH         Queue    0.003\n",
+      "9            FINISH  Resubmission    0.000\n",
+      "10           FINISH       Running    0.020\n",
+      "11           FINISH       Unknown    0.019\n",
+      "12             KILL         Queue    0.029\n",
+      "13             KILL  Resubmission    0.000\n",
+      "14             KILL       Running    0.135\n",
+      "15             KILL       Unknown    0.779\n"
     ]
    }
   ],
--- a/report/Claudio_Maggioni_report.pdf
+++ b/report/Claudio_Maggioni_report.pdf
--- a/report/Claudio_Maggioni_report.tex
+++ b/report/Claudio_Maggioni_report.tex
@ -82,6 +82,15 @@ and stored in JSONL format)\cite{google-drive-marso}, requiring a considerable
 amount of computational power to analyze them and the implementation of special
 data engineering techniques for analysis of the data.

+\input{figures/machine_configs}
+
+An overview of the machine configurations in the cluster analyzed with the 2011
+traces and in the 8 clusters composing the 2019 traces can be found in
+figure~\ref{fig:machineconfigs}. Additionally, in
+figure~\ref{fig:machineconfigs-csts}, the same machine configuration data is
+provided for the 2019 traces as a cluster-by-cluster distribution of the
+machines.
+
 This project aims to repeat the analysis performed in 2015 to highlight
 similarities and differences in workload this decade brought, and to expand the
 old analysis to understand even better the causes of failures and how to prevent
@ -441,102 +450,133 @@ deltas. Finally, the mean of the computed slowdown values is computed resulting
 in the clear and concise tables found in figure~\ref{fig:taskslowdown}.


-\hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{%
-\subsection{Ad-Hoc presentation of some analysis
-scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}}
-
-\textbf{TBD} (with diagrams)
-
-\hypertarget{analysis-and-observations}{%
-\section{Analysis and observations}\label{analysis-and-observations}}
-
-\hypertarget{overview-of-machine-configurations-in-each-cluster}{%
-\subsection{Overview of machine configurations in each
-cluster}\label{overview-of-machine-configurations-in-each-cluster}}
-
-\input{figures/machine_configs}
-
-Refer to figure \ref{fig:machineconfigs}.
-
-\textbf{Observations}:
-
-\begin{itemize}
-\item
-  machine configurations are definitely more varied than the ones in the
-  2011 traces
-\item
-  some clusters have more machine variability
-\end{itemize}
-
-\hypertarget{analysis-of-execution-time-per-each-execution-phase}{%
-\subsection{Analysis of execution time per each execution
-phase}\label{analysis-of-execution-time-per-each-execution-phase}}

+\section{Analysis: Performance Impact of Unsuccessful Executions}
 \input{figures/machine_time_waste}

-Refer to figures \ref{fig:machinetimewaste-abs} and
-\ref{fig:machinetimewaste-rel}.
+Our first investigation focuses on replicating the methodologies used in the
+2015 DSN Ros\'a et al.\ paper\cite{vino-paper} regarding usage of machine time
+and resources.

-\textbf{Observations}:
+In this section we perform several analyses focusing on how machine time and
+resources are wasted, by means of a temporal vs. spatial resource analysis from
+the perspective of single tasks as well as jobs. We then compare the results
+from the 2019 traces to the ones that were obtained in 2015 to understand the
+workload evolution inside Borg between 2011 and 2019.

-\begin{itemize}
-\item
-  Across all cluster almost 50\% of time is spent in ``unknown''
-  transitions, i.e. there are some time slices that are related to a
-  state transition that Google says are not ``typical'' transitions.
-  This is mostly due to the trace log being intermittent when recording
-  all state transitions.
-\item
-  80\% of the time spent in KILL and LOST is unknown. This is
-  predictable, since both states indicate that the job execution is not
-  stable (in particular LOST is used when the state logging itself is
-  unstable)
-\item
-  From the absolute graph we see that the time ``wasted'' on non-finish
-  terminated jobs is very significant
-\item
-  Execution is the most significant task phase, followed by queuing time
-  and scheduling time (``ready'' state)
-\item
-  In the absolute graph we see that a significant amount of time is
-  spent to re-schedule evicted jobs (``evicted'' state)
-\item
-  Cluster A has unusually high queuing times
-\end{itemize}
+\subsection{Temporal Impact: Machine Time Waste}

-\hypertarget{task-slowdown}{%
-\subsection{Task slowdown}\label{task-slowdown}}
+This analysis explores how machine time is distributed over task events and
+submissions. By partitioning the collection of all terminating tasks by their
+termination event, the analysis aims to measure the total time spent by tasks in
+3 different execution phases:

+\begin{description}
+\item[resubmission time:] the total of all time deltas between every task
+  termination event and the immediately succeeding task submission event, i.e.
+    the total time spent by tasks waiting to be resubmitted in Borg after a
+    termination;
+\item[queue time:] the total of all time deltas between every task submission
+  event and the following task scheduling event, i.e. the total time spent by
+    tasks queuing before execution;
+\item[running time:] the total of all time deltas between every task scheduling
+  event and the following task termination event, i.e. the total time spent by
+    tasks ``executing'' (i.e. performing useful computations) in the clusters.
+\end{description}
+
+In the 2019 traces, an additional ``Unknown'' measure is counted. This measure
+collects all the times in which the event transitions between the registered
+events do not allow us to safely determine in which execution phase a task is.
+Unknown measures are mostly caused by faults and missed event writes in the task
+event log that was used to generate the traces.
+
+The analysis results are depicted in figure~\ref{fig:machinetimewaste-rel} as a
+comparison between the 2011 and 2019 traces, aggregating the data from all
+clusters. Additionally, in figure~\ref{fig:machinetimewaste-rel-csts} a
+cluster-by-cluster breakdown of the results is provided for the 2019 traces.
+
+The striking difference between 2011 and 2019 data is in the machine time
+distribution per task termination type. In the 2019 traces, 94.38\% of global
+machine time is spent on tasks that are eventually \texttt{KILL}ed.
+\texttt{FINISH}, \texttt{EVICT} and \texttt{FAIL} tasks respectively register
+totals of 4.20\%, 1.18\% and 0.25\% machine time, maintaining an analogous
+distribution between them to their distribution in the 2011 traces.
+
+Considering instead the distribution between execution phase times, the
+comparison shows very similar behaviour between the two traces, with the
+``Running'' time being dominant (at a total of 16.63\% across task terminations
+in 2019) over the queue and resubmission phases (with respective totals in 2019
+of 3.26\% and 0.004\%).
+
+However, another noteworthy difference between 2011 and 2019 data lies in the new
+``Unknown'' trace dataset present only in the latter traces, registering a total
+80.12\% of global machine time across all terminations. This data can be
+interpreted as a strong indication of the ``poor quality'' of the 2019 traces
+w.r.t.\ the accuracy of task event logging.
+
+Considering instead the behaviour of each single cluster in the 2019 traces, no
+significant difference between them can be observed. The only notable difference
+lies between the ``Running time''-``Unknown time'' ratio in \texttt{KILL}ed
+tasks, which is at its highest in cluster A (at 30.68\% by 58.71\% of global
+machine time) and at its lowest in cluster H (at 8.06\% by 84.77\% of global
+machine time).
+
+\subsection{Average Slowdown per Task}
 \input{figures/task_slowdown}

-Refer to figure \ref{fig:taskslowdown}
+This analysis aims to measure the figure of ``slowdown'', which is defined as
+the ratio between the total response time (i.e.\ queue time and running time)
+across all executions of a given task and the response time of the last
+execution of said task. This metric is especially useful to analyze the impact
+of unsuccessful executions on each task's total execution time w.r.t.\ the intrinsic
+workload (i.e.\ computational time) of tasks.

-\textbf{Observations}:
+Refer to figure~\ref{fig:taskslowdown} for a comparison between the 2011 and
+2019 mean task slowdown measures broken down by task priority. Additionally, said
+means are computed on a cluster-by-cluster basis for 2019 data in
+figure~\ref{fig:taskslowdown-csts}.

-\begin{itemize}
-\item
-  Priority values are different from 0-11 values in the 2011 traces. A
-  conversion table is provided by Google;
-\item
-  For some priorities (e.g.~101 for cluster D) the relative number of
-  finishing task is very low and the mean slowdown is very high (315).
-  This behaviour differs from the relatively homogeneous values from the
-  2011 traces.
-\item
-  Some slowdown values cannot be computed since either some tasks have a
-  0ns execution time or for some priorities no tasks in the traces
-  terminate successfully. More raw data on those exception is in
-  Jupyter.
-\item
+In 2015 Ros\'a et al.\cite{vino-paper} measured mean task slowdown per each task
+priority value, which at the time were $[0,11]$ numeric values. However,
+in 2019 traces, task priorities are given as a $[0,500]$ numeric value.
+Therefore, to allow for an easier comparison, mean task slowdown values are
+computed by task priority tier over the 2019 data. Priority tiers are
+semantically relevant priority ranges defined in the Tirmazi et al.\ 2020
+paper\cite{google-marso-19} that introduced the 2019 traces. Equivalent priority
+tiers are also provided next to the 2011 priority values in the table covering
+the 2015 analysis.
+
+In the given tables, the \textbf{\% finished} column corresponds to the
+percentage of \texttt{FINISH}ed tasks for that priority or tier. \textbf{Mean
+response [s] (last execution)} instead shows the mean response time of the last
+task execution of each task in that priority/tier.
+\textbf{Mean response [s] (all executions)} provides a very similar figure,
+though this column shows the mean response time across all executions.
+\textbf{Mean slowdown} instead provides the mean slowdown value for each task
+priority/tier.
+
+Comparing the tables in figure~\ref{fig:taskslowdown} we observe that the
+maximum mean slowdown measure for 2019 data (i.e.\ 7.84, for the BEB tier) is more
+than double the maximum measure in 2011 data (i.e.\ 3.39, for priority $3$
+corresponding to the BEB tier). The ``Best effort batch'' tier, as the name
+suggests, is a lower priority tier where failures are more tolerated. Therefore,
+due to the increased concurrency in the 2019 clusters compared to 2011 and the
+higher machine time spent for unsuccessful executions (as observed in the
+previous analysis), an increased slowdown rate for this class is not particularly
+surprising.
+
+\textbf{TBD}
  The \% of finishing jobs is relatively low comparing with the 2011
  traces.
-\end{itemize}
+
+\input{figures/spatial_resource_waste}
+\input{figures/table_iii} % has table III and table IV in it
+\input{figures/figure_5}

 \hypertarget{reserved-and-actual-resource-usage-of-tasks}{%
 \subsection{Reserved and actual resource usage of
 tasks}\label{reserved-and-actual-resource-usage-of-tasks}}

-\input{figures/spatial_resource_waste}

 Refer to figures \ref{fig:spatialresourcewaste-actual} and
 \ref{fig:spatialresourcewaste-requested}.
@ -553,6 +593,63 @@ Refer to figures \ref{fig:spatialresourcewaste-actual} and
  both CPU and RAM
 \end{itemize}

+
+Refer to figure \ref{fig:tableIII}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\item
+  The mean number of events per task is an order of magnitude higher
+  than in the 2011 traces
+\item
+  Generally speaking, the event type with the highest mean is the termination
+  event for the task
+\item
+  The \# evts mean is higher than the sum of all other event type means,
+  since it appears there are a lot more non-termination events in the
+  2019 traces.
+\end{itemize}
+
+\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
+\subsection{Mean number of tasks and event distribution per job
+type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
+
+
+\textbf{Observations}:
+
+\begin{itemize}
+\item
+  Again the mean number of tasks is significantly higher than the 2011
+  traces, indicating a higher complexity of workloads
+\item
+  Cluster A has no evicted jobs
+\item
+  The number of events is however lower than the event means in the 2011
+  traces
+\end{itemize}
+
+\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
+\subsection{Probability of task successful termination given its
+unsuccessful
+events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
+
+
+Refer to figure \ref{fig:figureV}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\item
+  Behaviour is very different from cluster to cluster
+\item
+  There is no easy conclusion, unlike in 2011, on the correlation
+  between the probability of success and the \# of events of a specific type.
+\item
+  Clusters B, C and D in particular have very jagged lines that vary a
+  lot for small differences in \# evts. This may be due to an uneven
+  distribution of \# evts in the traces.
+\end{itemize}
 \hypertarget{correlation-between-task-events-metadata-and-task-termination}{%
 \subsection{Correlation between task events' metadata and task
 termination}\label{correlation-between-task-events-metadata-and-task-termination}}
@ -620,66 +717,6 @@ Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and
 \subsection{Mean number of tasks and event distribution per task
 type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}}

-\input{figures/table_iii}
-
-Refer to figure \ref{fig:tableIII}.
-
-\textbf{Observations}:
-
-\begin{itemize}
-\item
-  The mean number of events per task is an order of magnitude higher
-  than in the 2011 traces
-\item
-  Generally speaking, the event type with higher mean is the termination
-  event for the task
-\item
-  The \# evts mean is higher than the sum of all other event type means,
-  since it appears there are a lot more non-termination events in the
-  2019 traces.
-\end{itemize}
-
-\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
-\subsection{Mean number of tasks and event distribution per job
-type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
-
-Refer to figure \ref{fig:tableIV}.
-
-\textbf{Observations}:
-
-\begin{itemize}
-\item
-  Again the mean number of tasks is significantly higher than the 2011
-  traces, indicating a higher complexity of workloads
-\item
-  Cluster A has no evicted jobs
-\item
-  The number of events is however lower than the event means in the 2011
-  traces
-\end{itemize}
-
-\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
-\subsection{Probability of task successful termination given its
-unsuccesful
-events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
-
-\input{figures/figure_5}
-
-Refer to figure \ref{fig:figureV}.
-
-\textbf{Observations}:
-
-\begin{itemize}
-\item
-  Behaviour is very different from cluster to cluster
-\item
-  There is no easy conclusion, unlike in 2011, on the correlation
-  between succesful probability and \# of events of a specific type.
-\item
-  Clusters B, C and D in particular have very unsmooth lines that vary a
-  lot for small \# evts differences. This may be due to an uneven
-  distribution of \# evts in the traces.
-\end{itemize}

 \hypertarget{potential-causes-of-unsuccesful-executions}{%
 \subsection{Potential causes of unsuccessful
--- a/report/figures/spatial_resource_waste.tex
+++ b/report/figures/spatial_resource_waste.tex
@ -6,13 +6,13 @@
 	%\hfill
 	\end{subfigure}}

-\begin{figure}
+\begin{figure}[p]
 \spatialresourcewaste[0.5\textwidth]{used-2011}
 \spatialresourcewaste[0.5\textwidth]{used-all}
 	\caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type in 2011 and 2019 traces (total of clusters A to D). The x axis is the type of resource, y-axis is the percentage of resource used and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-requested}
 \end{figure}

-\begin{figure}
+\begin{figure}[p]
 \spatialresourcewaste{used-a}
 \spatialresourcewaste{used-b}
 \spatialresourcewaste{used-c}
@ -20,13 +20,13 @@
 	\caption{Percentages of CPU and RAM resources used by tasks w.r.t. task termination type for clusters A to D in 2019 traces. Refer to figure~\ref{fig:spatialresourcewaste-requested} for plot explanation.}\label{fig:spatialresourcewaste-actual-csts}
 \end{figure}

-\begin{figure}
+\begin{figure}[p]
 \spatialresourcewaste[0.5\textwidth]{requested-2011}
 \spatialresourcewaste[0.5\textwidth]{requested-all}
 	\caption{Percentages of CPU and RAM resources requested by tasks w.r.t. task termination type in 2011 and 2019 traces. The x-axis is the type of resource, the y-axis is the percentage of resource requested and color represents task termination. Numeric values are displayed below the graph as a table.}\label{fig:spatialresourcewaste-actual}
 \end{figure}

-\begin{figure}
+\begin{figure}[p]
 \spatialresourcewaste{requested-a}
 \spatialresourcewaste{requested-b}
 \spatialresourcewaste{requested-c}
				`@ -1 +0,0 @@`
				`,maggicl,Apple2gs.local,24.05.2021 14:04,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;`