test

2021-05-17 16:27:17 +02:00 · 2021-05-17 16:27:17 +02:00 · 3746bfc9c9
commit 3746bfc9c9
parent 0d935c2112
6 changed files with 909 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,293 @@ figure_9/*.parquet/
 figure_9/?_task_count/
 figure_9/?_machine_locality/
 table_iii/*.parquet/
 ## Core latex/pdflatex auxiliary files:
 *.aux
 *.lof
 *.log
 *.lot
 *.fls
 *.out
 *.toc
 *.fmt
 *.fot
 *.cb
 *.cb2
 .*.lb
 ## Intermediate documents:
 *.dvi
 *.xdv
 *-converted-to.*
 # these rules might exclude image files for figures etc.
 # *.ps
 # *.eps
 # *.pdf
 ## Generated if empty string is given at "Please type another file name for output:"
 .pdf
 ## Bibliography auxiliary files (bibtex/biblatex/biber):
 *.bbl
 *.bcf
 *.blg
 *-blx.aux
 *-blx.bib
 *.run.xml
 ## Build tool auxiliary files:
 *.fdb_latexmk
 *.synctex
 *.synctex(busy)
 *.synctex.gz
 *.synctex.gz(busy)
 *.pdfsync
 ## Build tool directories for auxiliary files
 # latexrun
 latex.out/
 ## Auxiliary and intermediate files from other packages:
 # algorithms
 *.alg
 *.loa
 # achemso
 acs-*.bib
 # amsthm
 *.thm
 # beamer
 *.nav
 *.pre
 *.snm
 *.vrb
 # changes
 *.soc
 # comment
 *.cut
 # cprotect
 *.cpt
 # elsarticle (documentclass of Elsevier journals)
 *.spl
 # endnotes
 *.ent
 # fixme
 *.lox
 # feynmf/feynmp
 *.mf
 *.mp
 *.t[1-9]
 *.t[1-9][0-9]
 *.tfm
 #(r)(e)ledmac/(r)(e)ledpar
 *.end
 *.?end
 *.[1-9]
 *.[1-9][0-9]
 *.[1-9][0-9][0-9]
 *.[1-9]R
 *.[1-9][0-9]R
 *.[1-9][0-9][0-9]R
 *.eledsec[1-9]
 *.eledsec[1-9]R
 *.eledsec[1-9][0-9]
 *.eledsec[1-9][0-9]R
 *.eledsec[1-9][0-9][0-9]
 *.eledsec[1-9][0-9][0-9]R
 # glossaries
 *.acn
 *.acr
 *.glg
 *.glo
 *.gls
 *.glsdefs
 *.lzo
 *.lzs
 # uncomment this for glossaries-extra (will ignore makeindex's style files!)
 # *.ist
 # gnuplottex
 *-gnuplottex-*
 # gregoriotex
 *.gaux
 *.glog
 *.gtex
 # htlatex
 *.4ct
 *.4tc
 *.idv
 *.lg
 *.trc
 *.xref
 # hyperref
 *.brf
 # knitr
 *-concordance.tex
 # TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
 # *.tikz
 *-tikzDictionary
 # listings
 *.lol
 # luatexja-ruby
 *.ltjruby
 # makeidx
 *.idx
 *.ilg
 *.ind
 # minitoc
 *.maf
 *.mlf
 *.mlt
 *.mtc[0-9]*
 *.slf[0-9]*
 *.slt[0-9]*
 *.stc[0-9]*
 # minted
 _minted*
 *.pyg
 # morewrites
 *.mw
 # newpax
 *.newpax
 # nomencl
 *.nlg
 *.nlo
 *.nls
 # pax
 *.pax
 # pdfpcnotes
 *.pdfpc
 # sagetex
 *.sagetex.sage
 *.sagetex.py
 *.sagetex.scmd
 # scrwfile
 *.wrt
 # sympy
 *.sout
 *.sympy
 sympy-plots-for-*.tex/
 # pdfcomment
 *.upa
 *.upb
 # pythontex
 *.pytxcode
 pythontex-files-*/
 # tcolorbox
 *.listing
 # thmtools
 *.loe
 # TikZ & PGF
 *.dpth
 *.md5
 *.auxlock
 # todonotes
 *.tdo
 # vhistory
 *.hst
 *.ver
 # easy-todo
 *.lod
 # xcolor
 *.xcp
 # xmpincl
 *.xmpi
 # xindy
 *.xdy
 # xypic precompiled matrices and outlines
 *.xyc
 *.xyd
 # endfloat
 *.ttt
 *.fff
 # Latexian
 TSWLatexianTemp*
 ## Editors:
 # WinEdt
 *.bak
 *.sav
 # Texpad
 .texpadtmp
 # LyX
 *.lyx~
 # Kile
 *.backup
 # gummi
 .*.swp
 # KBibTeX
 *~[0-9]*
 # TeXnicCenter
 *.tps
 # auto folder when using emacs and auctex
 ./auto/*
 *.el
 # expex forward references with \gathertags
 *-tags.tex
 # standalone packages
 *.sta
 # Makeindex log files
 *.lpz
 # xwatermark package
 *.xwm
 # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
 # option is specified. Footnotes are then stored in a file with suffix Notes.bib.
 # Uncomment the next line to have this generated file ignored.
 #*Notes.bib
--- a/.~lock.status.ods#
+++ b/.~lock.status.ods#
@ -1 +0,0 @@
 ,maggicl,Apple2gs.local,16.05.2021 14:55,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;
--- a/report/Claudio_Maggioni_report.md
+++ b/report/Claudio_Maggioni_report.md
@ -52,7 +52,17 @@ header-includes:
 ## Rosà et al. 2015 DSN paper
-**TBD**
+In 2015, Dr. Andrea Rosà, Lydia Y. Chen, Prof. Walter Binder published a
 research paper titled "Understanding the Dark Side of Big Data Clusters:
 An Analysis beyond Failures" performing several analyses on Google's 2011
 Borg cluster traces. The salient conclusion of that research is that lots of
 computation performed by Google would eventually fail, leading to large amounts
 of computational power being wasted.
 Our aim with this thesis is to repeat the analysis performed in 2015 on the new
 2019 dataset to find similarities and differences with the previous analysis,
 and ultimately find if computational power is indeed wasted in this new workload
 as well.
 ## Google Borg
@ -162,22 +172,30 @@ This approach is discussed with further detail in the following section.
 **TBD**
 ## Overview on challenging aspects of analysis (data size, schema, available computation resources)
 **TBD**
 ## Introduction on Apache Spark
-**TBD**
+Apache Spark is a unified analytics engine for large-scale data processing. In
 layman's terms, Spark is really useful to parallelize computations in a fast and
 streamlined way.
-## General workflow description of apache spark workflow
+In the scope of this thesis, Spark was used essentially as a Map-Reduce
 framework for computing aggregated results on the various tables. Due to the
 sharded nature of table "files", Spark is able to spawn a thread per file and
 run computations using all processors on the server machines used to run the
 analysis.
-**TBD** (extract from the notes sent to Filippo shown below)
+Spark is also quite powerful since it provides automated thread pooling
 services, and it is able to efficiently store and cache intermediate computation
 on secondary storage without any additional effort required from the data
 engineer. This feature was especially useful due to the sheer size of the
 analyzed data, since the computations required to store up to 1TiB of
 intermediate data on disk.
-The Google 2019 Borg cluster traces analysis were conducted by using Apache
+The chosen programming language for writing analysis scripts was Python. Spark
-Spark and its Python 3 API (pyspark).  Spark was used to execute a series of
+has very powerful native Python bindings in the form of the _PySpark_ API, which
-queries to perform various sums and aggregations over the entire dataset
+were used to implement the various queries.
-provided by Google.
+
 ## Query architecture
 In general, each query follows a general Map-Reduce template, where traces are
 first read, parsed, filtered by performing selections, projections and computing
@ -202,10 +220,10 @@ memory during the query, a projection is often applied to the data by the means
 of a .map() operation over the entire trace set, performed using Spark's RDD
 API.
-Another operation that is often necessary to perform prior to the Map-Reduce core of
+Another operation that is often necessary to perform prior to the Map-Reduce
-each query is a record filtering process, which is often motivated by the
+core of each query is a record filtering process, which is often motivated by
-presence of incomplete data (i.e. records which contain fields whose values is
+the presence of incomplete data (i.e. records which contain fields whose value
-unknown). This filtering is performed using the .filter() operation of Spark's
+is unknown). This filtering is performed using the .filter() operation of Spark's
 RDD API.
 The core of each query is often a groupBy followed by a map() operation on the
@ -222,6 +240,8 @@ compute and save intermediate results beforehand.
 ## General Query script design
 **TBD**
 ## Ad-Hoc presentation of some analysis scripts
--- a/report/Claudio_Maggioni_report.pdf
+++ b/report/Claudio_Maggioni_report.pdf
--- a/report/Claudio_Maggioni_report.tex
+++ b/report/Claudio_Maggioni_report.tex
@ -0,0 +1,583 @@
 \documentclass{usiinfbachelorproject}
 \title{Understanding and Comparing Unsuccessful Executions in Large Datacenters}
 \author{Claudio Maggioni}
 \usepackage{amsmath}
 \usepackage{subcaption}
 \usepackage{booktabs}
 \usepackage{graphicx}
 \captionsetup{labelfont={bf}}
 %\subtitle{The (optional) subtitle}
 \versiondate{\today}
 \begin{committee}
 \advisor[Universit\`a della Svizzera Italiana,
 Switzerland]{Prof.}{Walter}{Binder}
 \assistant[Universit\`a della Svizzera Italiana,
 Switzerland]{Dr.}{Andrea}{Ros\`a}
 \end{committee}
 \abstract{The project aims at comparing two different traces coming from large
 datacenters, focusing in particular on unsuccessful executions of jobs and
 tasks submitted by users. The objective of this project is to compare the
 resource waste caused by unsuccessful executions, their impact on application
 performance, and their root causes. We will show the strong negative impact on
 CPU and RAM usage and on task slowdown.  We will analyze patterns of
 unsuccessful jobs and tasks, particularly focusing on their interdependency.
 Moreover, we will uncover their root causes by inspecting key workload and
 system attributes such as machine locality and concurrency level.}
 \begin{document}
 \tableofcontents
 \newpage
 \hypertarget{introduction-including-motivation}{%
 \section{Introduction (including
 Motivation)}\label{introduction-including-motivation}}
 \hypertarget{state-of-the-art}{%
 \section{State of the Art}\label{state-of-the-art}}
 \hypertarget{introduction}{%
 \subsection{Introduction}\label{introduction}}
 \textbf{TBD}
 \hypertarget{rosuxe0-et-al.-2015-dsn-paper}{%
 \subsection{Rosà et al.~2015 DSN
 paper}\label{rosuxe0-et-al.-2015-dsn-paper}}
 In 2015, Dr.~Andrea Rosà, Lydia Y. Chen, Prof.~Walter Binder published a
 research paper titled ``Understanding the Dark Side of Big Data
 Clusters: An Analysis beyond Failures'' performing several analyses on
 Google's 2011 Borg cluster traces. The salient conclusion of that
 research is that lots of computation performed by Google would
 eventually fail, leading to large amounts of computational power being
 wasted.
 Our aim with this thesis is to repeat the analysis performed in 2015 on
 the new 2019 dataset to find similarities and differences with the
 previous analysis, and ultimately find if computational power is indeed
 wasted in this new workload as well.
 \hypertarget{google-borg}{%
 \subsection{Google Borg}\label{google-borg}}
 Borg is Google's own cluster management software. Among the various
 cluster management services it provides, the main ones are: job queuing,
 scheduling, allocation, and deallocation due to higher priority
 computations.
 The data this thesis is based on is from 8 Borg ``cells''
 (i.e.~clusters) spanning 8 different datacenters, all focused on
 ``compute'' (i.e.~computational oriented) workloads. The data collection
 timespan matches the entire month of May 2019.
 In Google's lingo a ``job'' is a large unit of computational workload
 made up of several ``tasks'', i.e.~a number of executions of single
 executables running on a single machine. A job may run tasks
 sequentially or in parallel, and the condition for a job's successful
 termination is nontrivial.
 Both task and job lifecycles are represented by several events, which
 are encoded and stored in the trace as rows of various tables. Among the
 information events provide, the field ``type'' provides information on
 the execution status of the job or task. This field can have the
 following values:
 \begin{itemize}
 \tightlist
 \item
  \textbf{QUEUE}: The job or task was marked not eligible for scheduling
  by Borg's scheduler, and thus Borg will move the job/task in a long
  wait queue;
 \item
  \textbf{SUBMIT}: The job or task was submitted to Borg for execution;
 \item
  \textbf{ENABLE}: The job or task became eligible for scheduling;
 \item
  \textbf{SCHEDULE}: The job or task's execution started;
 \item
  \textbf{EVICT}: The job or task was terminated in order to free
  computational resources for a higher priority job;
 \item
  \textbf{FAIL}: The job or task terminated its execution unsuccessfully
  due to a failure;
 \item
  \textbf{FINISH}: The job or task terminated successfully;
 \item
  \textbf{KILL}: The job or task terminated its execution because of a
  manual request to stop it;
 \item
  \textbf{LOST}: It is assumed a job or task has been terminated, but
  due to missing data there is insufficient information to identify when
  or how;
 \item
  \textbf{UPDATE\_PENDING}: The metadata (scheduling class, resource
  requirements, \ldots) of the job/task was updated while the job was
  waiting to be scheduled;
 \item
  \textbf{UPDATE\_RUNNING}: The metadata (scheduling class, resource
  requirements, \ldots) of the job/task was updated while the job was in
  execution;
 \end{itemize}
 Figure \ref{fig:eventTypes} shows the expected transitions between event
 types.
 \begin{figure}
 \centering
 \includegraphics{./figures/event_types.png}
 \caption{Typical transitions between task/job event types according to
 Google \label{fig:eventTypes}}
 \end{figure}
 \hypertarget{traces-contents}{%
 \subsection{Traces contents}\label{traces-contents}}
 The traces provided by Google contain mainly a collection of job and
 task events spanning a month of execution of the 8 different clusters.
 In addition to this data, the traces include some additional data on the
 machines' configuration in terms of resources (i.e.~amount of CPU and RAM) and
 additional machine-related metadata.
 Due to Google's policy, most identification related data (like job/task
 IDs, raw resource amounts and other text values) were obfuscated prior
 to the release of the traces. One obfuscation that is noteworthy in the
 scope of this thesis is related to CPU and RAM amounts, which are
 expressed respectively in NCUs (\emph{Normalized Compute Units}) and NMUs
 (\emph{Normalized Memory Units}).
 NCUs and NMUs are defined based on the raw machine resource
 distributions of the machines within the 8 clusters. A machine having 1
 NCU CPU power and 1 NMU memory size has the maximum amount of raw CPU
 power and raw RAM size found in the clusters. While RAM size is measured
 in bytes for normalization purposes, CPU power was measured in GCU
 (\emph{Google Compute Units}), a proprietary CPU power measurement unit
 used by Google that combines several parameters like number of
 processors and cores, clock frequency, and architecture (i.e.~ISA).
 \hypertarget{overview-of-traces-format}{%
 \subsection{Overview of traces'
 format}\label{overview-of-traces-format}}
 The traces have a collective size of approximately 8TiB and are stored
 in a Gzip-compressed JSONL (JSON lines) format, which means that each
 table is represented by a single logical ``file'' (stored in several
 file segments) where each carriage return separated line represents a
 single record for that table.
 There are namely 5 different table ``files'':
 \begin{itemize}
 \tightlist
 \item
  \texttt{machine\_configs}, which is a table containing each physical
  machine's configuration and its evolution over time;
 \item
  \texttt{instance\_events}, which is a table of task events;
 \item
  \texttt{collection\_events}, which is a table of job events;
 \item
  \texttt{machine\_attributes}, which is a table containing (obfuscated)
  metadata about each physical machine and its evolution over time;
 \item
  \texttt{instance\_usage}, which contains resource (CPU/RAM) measures
  of jobs and tasks running on the single machines.
 \end{itemize}
 The scope of this thesis focuses on the tables
 \texttt{machine\_configs}, \texttt{instance\_events} and
 \texttt{collection\_events}.
 \hypertarget{remark-on-traces-size}{%
 \subsection{Remark on traces size}\label{remark-on-traces-size}}
 While the 2011 Google Borg traces were relatively small, with a total
 size in the order of the tens of gigabytes, the 2019 traces are quite
 challenging to analyze due to their sheer size. As stated before, the
 traces have a total size of 8 TiB when stored in the format provided by
 Google. Even when broken down to table ``files'', unitary sizes still
 reach the single tebibyte mark (namely for \texttt{machine\_configs},
 the largest table in the trace).
 Due to these constraints, a careful data engineering based approach was
 used when reproducing the 2015 DSN paper analysis. Bleeding edge data
 science technologies like Apache Spark were used to achieve efficient
 and parallelized computations. This approach is discussed with further
 detail in the following section.
 \hypertarget{project-requirements-and-analysis}{%
 \section{Project requirements and
 analysis}\label{project-requirements-and-analysis}}
 \textbf{TBD} (describe our objective with this analysis in detail)
 \hypertarget{analysis-methodology}{%
 \section{Analysis methodology}\label{analysis-methodology}}
 \textbf{TBD}
 \hypertarget{introduction-on-apache-spark}{%
 \subsection{Introduction on Apache
 Spark}\label{introduction-on-apache-spark}}
 Apache Spark is a unified analytics engine for large-scale data
 processing. In layman's terms, Spark is really useful to parallelize
 computations in a fast and streamlined way.
 In the scope of this thesis, Spark was used essentially as a Map-Reduce
 framework for computing aggregated results on the various tables. Due to
 the sharded nature of table ``files'', Spark is able to spawn a thread
 per file and run computations using all processors on the server
 machines used to run the analysis.
 Spark is also quite powerful since it provides automated thread pooling
 services, and it is able to efficiently store and cache intermediate
 computation on secondary storage without any additional effort required
 from the data engineer. This feature was especially useful due to the
 sheer size of the analyzed data, since the computations required to
 store up to 1TiB of intermediate data on disk.
 The chosen programming language for writing analysis scripts was Python.
 Spark has very powerful native Python bindings in the form of the
 \emph{PySpark} API, which were used to implement the various queries.
 \hypertarget{query-architecture}{%
 \subsection{Query architecture}\label{query-architecture}}
 In general, each query follows a general Map-Reduce template, where
 traces are first read, parsed, filtered by performing selections,
 projections and computing new derived fields. Then, the trace records
 are often grouped by one of their fields, clustering related data
 together before a reduce or fold operation is applied to each grouping.
 Most input data is in JSONL format and adheres to a schema Google
 provided in the form of a protobuffer specification\footnote{\href{https://github.com/google/cluster-data/blob/master/clusterdata_trace_format_v3.proto}{Google
  2019 Borg traces Protobuffer specification on Github}}.
 One of the main quirks in the traces is that fields that have a ``zero''
 value (i.e.~a value like 0 or the empty string) are often omitted in the
 JSON object records. When reading the traces in Apache Spark it is
 therefore necessary to check for this possibility and populate those
 zero fields when omitted.
 Most queries use only two or three fields in each trace record, while
 the original records often are made of a couple of dozen fields. In
 order to save memory during the query, a projection is often applied to
 the data by the means of a .map() operation over the entire trace set,
 performed using Spark's RDD API.
 Another operation that is often necessary to perform prior to the
 Map-Reduce core of each query is a record filtering process, which is
 often motivated by the presence of incomplete data (i.e.~records which
 contain fields whose value is unknown). This filtering is performed
 using the .filter() operation of Spark's RDD API.
 The core of each query is often a groupBy followed by a map() operation
 on the aggregated data. The groupby groups the set of all records into
 several subsets of records each having something in common. Then, each
 of these small clusters is reduced with a .map() operation to a single
 record. The motivation behind this computation is often to analyze a
 time series of several different traces of programs. This is implemented
 by groupBy()-ing records by program id, and then map()-ing each program
 trace set by sorting by time the traces and computing the desired
 property in the form of a record.
 Sometimes intermediate results are saved in Spark's parquet format in
 order to compute and save intermediate results beforehand.
 \hypertarget{general-query-script-design}{%
 \subsection{General Query script
 design}\label{general-query-script-design}}
 \textbf{TBD}
 \hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{%
 \subsection{Ad-Hoc presentation of some analysis
 scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}}
 \textbf{TBD} (with diagrams)
 \hypertarget{analysis-and-observations}{%
 \section{Analysis and observations}\label{analysis-and-observations}}
 \hypertarget{overview-of-machine-configurations-in-each-cluster}{%
 \subsection{Overview of machine configurations in each
 cluster}\label{overview-of-machine-configurations-in-each-cluster}}
 \input{figures/machine_configs}
 Refer to figure \ref{fig:machineconfigs}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  machine configurations are definitely more varied than the ones in the
  2011 traces
 \item
  some clusters have more machine variability
 \end{itemize}
 \hypertarget{analysis-of-execution-time-per-each-execution-phase}{%
 \subsection{Analysis of execution time per each execution
 phase}\label{analysis-of-execution-time-per-each-execution-phase}}
 \input{figures/machine_time_waste}
 Refer to figures \ref{fig:machinetimewaste-abs} and
 \ref{fig:machinetimewaste-rel}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  Across all clusters almost 50\% of time is spent in ``unknown''
  transitions, i.e. there are some time slices that are related to a
  state transition that Google says are not ``typical'' transitions.
  This is mostly due to the trace log being intermittent when recording
  all state transitions.
 \item
  80\% of the time spent in KILL and LOST is unknown. This is
  predictable, since both states indicate that the job execution is not
  stable (in particular LOST is used when the state logging itself is
  unstable)
 \item
  From the absolute graph we see that the time ``wasted'' on non-finish
  terminated jobs is very significant
 \item
  Execution is the most significant task phase, followed by queuing time
  and scheduling time (``ready'' state)
 \item
  In the absolute graph we see that a significant amount of time is
  spent to re-schedule evicted jobs (``evicted'' state)
 \item
  Cluster A has unusually high queuing times
 \end{itemize}
 \hypertarget{task-slowdown}{%
 \subsection{Task slowdown}\label{task-slowdown}}
 \input{figures/task_slowdown}
 Refer to figure \ref{fig:taskslowdown}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  Priority values are different from 0-11 values in the 2011 traces. A
  conversion table is provided by Google;
 \item
  For some priorities (e.g.~101 for cluster D) the relative number of
  finishing tasks is very low and the mean slowdown is very high (315).
  This behaviour differs from the relatively homogeneous values from the
  2011 traces.
 \item
  Some slowdown values cannot be computed since either some tasks have a
  0ns execution time or for some priorities no tasks in the traces
  terminate successfully. More raw data on those exceptions is in
  Jupyter.
 \item
  The \% of finishing jobs is relatively low compared with the 2011
  traces.
 \end{itemize}
 \hypertarget{reserved-and-actual-resource-usage-of-tasks}{%
 \subsection{Reserved and actual resource usage of
 tasks}\label{reserved-and-actual-resource-usage-of-tasks}}
 \input{figures/spatial_resource_waste}
 Refer to figures \ref{fig:spatialresourcewaste-actual} and
 \ref{fig:spatialresourcewaste-requested}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  Most (measured and requested) resources are used by killed jobs, even
  more than in the 2011 traces.
 \item
  Behaviour is rather homogeneous across datacenters, with the exception
  of cluster G where a lot of LOST-terminated tasks acquired 70\% of
  both CPU and RAM
 \end{itemize}
 \hypertarget{correlation-between-task-events-metadata-and-task-termination}{%
 \subsection{Correlation between task events' metadata and task
 termination}\label{correlation-between-task-events-metadata-and-task-termination}}
 \input{figures/figure_7}
 Refer to figures \ref{fig:figureVII-a}, \ref{fig:figureVII-b}, and
 \ref{fig:figureVII-c}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  No smooth curves in this figure either, unlike 2011 traces
 \item
  The behaviour of curves for 7a (priority) is almost the opposite of
  2011, i.e. in-between priorities have higher kill rates while
  priorities at the extremum have lower kill rates. This could also be
  due to the inherent distribution of job terminations;
 \item
  Event execution time curves are quite different than 2011, here it
  seems there is a good correlation between short task execution times
  and finish event rates, instead of the U shape curve in 2015 DSN
 \item
  In figure \ref{fig:figureVII-b} cluster behaviour seems quite uniform
 \item
  Machine concurrency seems to play little role in the event termination
  distribution, as for all concurrency factors the kill rate is at 90\%.
 \end{itemize}
 \hypertarget{correlation-between-task-events-resource-metadata-and-task-termination}{%
 \subsection{Correlation between task events' resource metadata and task
 termination}\label{correlation-between-task-events-resource-metadata-and-task-termination}}
 \hypertarget{correlation-between-job-events-metadata-and-job-termination}{%
 \subsection{Correlation between job events' metadata and job
 termination}\label{correlation-between-job-events-metadata-and-job-termination}}
 \input{figures/figure_9}
 Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and
 \ref{fig:figureIX-c}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  Behaviour between clusters varies a lot
 \item
  There are no ``smooth'' gradients in the various curves unlike in the
  2011 traces
 \item
  Killed jobs have higher event rates in general, and overall dominate
  all event rates measures
 \item
  There still seems to be a correlation between short execution job
  times and successful final termination, and likewise for kills and
  higher job terminations
 \item
  Across all clusters, a machine locality factor of 1 seems to lead to
  the highest success event rate
 \end{itemize}
 \hypertarget{mean-number-of-tasks-and-event-distribution-per-task-type}{%
 \subsection{Mean number of tasks and event distribution per task
 type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}}
 \input{figures/table_iii}
 Refer to figure \ref{fig:tableIII}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  The mean number of events per task is an order of magnitude higher
  than in the 2011 traces
 \item
  Generally speaking, the event type with higher mean is the termination
  event for the task
 \item
  The \# evts mean is higher than the sum of all other event type means,
  since it appears there are a lot more non-termination events in the
  2019 traces.
 \end{itemize}
 \hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
 \subsection{Mean number of tasks and event distribution per job
 type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
 \input{figures/table_iv}
 Refer to figure \ref{fig:tableIV}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  Again the mean number of tasks is significantly higher than the 2011
  traces, indicating a higher complexity of workloads
 \item
  Cluster A has no evicted jobs
 \item
  The number of events is however lower than the event means in the 2011
  traces
 \end{itemize}
 \hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
 \subsection{Probability of task successful termination given its
 unsuccessful
 events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
 \input{figures/figure_5}
 Refer to figure \ref{fig:figureV}.
 \textbf{Observations}:
 \begin{itemize}
 \tightlist
 \item
  Behaviour is very different from cluster to cluster
 \item
  There is no easy conclusion, unlike in 2011, on the correlation
  between successful probability and \# of events of a specific type.
 \item
  Clusters B, C and D in particular have very unsmooth lines that vary a
  lot for small \# evts differences. This may be due to an uneven
  distribution of \# evts in the traces.
 \end{itemize}
 \hypertarget{potential-causes-of-unsuccesful-executions}{%
 \subsection{Potential causes of unsuccessful
 executions}\label{potential-causes-of-unsuccesful-executions}}
 \textbf{TBD}
 \hypertarget{implementation-issues-analysis-limitations}{%
 \section{Implementation issues -- Analysis
 limitations}\label{implementation-issues-analysis-limitations}}
 \hypertarget{discussion-on-unknown-fields}{%
 \subsection{Discussion on unknown
 fields}\label{discussion-on-unknown-fields}}
 \textbf{TBD}
 \hypertarget{limitation-on-computation-resources-required-for-the-analysis}{%
 \subsection{Limitation on computation resources required for the
 analysis}\label{limitation-on-computation-resources-required-for-the-analysis}}
 \textbf{TBD}
 \hypertarget{other-limitations}{%
 \subsection{Other limitations \ldots{}}\label{other-limitations}}
 \textbf{TBD}
 \hypertarget{conclusions-and-future-work-or-possible-developments}{%
 \section{Conclusions and future work or possible
 developments}\label{conclusions-and-future-work-or-possible-developments}}
 \textbf{TBD}
 \end{document}
--- a/status.ods
+++ b/status.ods
		`@ -1 +0,0 @@`
			`,maggicl,Apple2gs.local,16.05.2021 14:55,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;`