diff --git a/.gitignore b/.gitignore index 6c92308f..fa164ef1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,293 @@ figure_9/*.parquet/ figure_9/?_task_count/ figure_9/?_machine_locality/ table_iii/*.parquet/ + +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) +# *.ist + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. 
+#*Notes.bib
+
diff --git a/.~lock.status.ods# b/.~lock.status.ods#
deleted file mode 100644
index 0eeee260..00000000
--- a/.~lock.status.ods#
+++ /dev/null
@@ -1 +0,0 @@
-,maggicl,Apple2gs.local,16.05.2021 14:55,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
diff --git a/report/Claudio_Maggioni_report.md b/report/Claudio_Maggioni_report.md
index 0b15d961..2356d2f6 100644
--- a/report/Claudio_Maggioni_report.md
+++ b/report/Claudio_Maggioni_report.md
@@ -52,7 +52,17 @@ header-includes:
 
 ## Rosà et al. 2015 DSN paper
 
-**TBD**
+In 2015, Dr. Andrea Rosà, Lydia Y. Chen, and Prof. Walter Binder published a
+research paper titled "Understanding the Dark Side of Big Data Clusters:
+An Analysis beyond Failures", presenting several analyses of Google's 2011
+Borg cluster traces. The salient conclusion of that research is that a large
+fraction of the computations performed by Google eventually fails, leading to
+large amounts of computational power being wasted.
+
+Our aim with this thesis is to repeat the analysis performed in 2015 on the new
+2019 dataset to find similarities and differences with the previous analysis,
+and ultimately to determine whether computational power is wasted in this new
+workload as well.
 
 ## Google Borg
 
@@ -162,22 +172,30 @@ This approach is discussed with further detail in the following section.
 
 **TBD**
 
-## Overview on challenging aspects of analysis (data size, schema, avaliable computation resources)
-
-**TBD**
-
 ## Introduction on Apache Spark
 
-**TBD**
+Apache Spark is a unified analytics engine for large-scale data processing. In
+layman's terms, Spark provides a fast and streamlined way to parallelize
+computations.
 
-## General workflow description of apache spark workflow
+In the scope of this thesis, Spark was used essentially as a Map-Reduce
+framework for computing aggregated results on the various tables. Due to the
+sharded nature of table "files", Spark is able to spawn a thread per file
+segment and run computations using all processors of the server machines used
+to run the analysis.
 
-**TBD** (extract from the notes sent to Filippo shown below)
+Spark is also quite powerful since it provides automated thread pooling
+services, and it is able to efficiently store and cache intermediate
+computations on secondary storage without any additional effort required from
+the data engineer. This feature was especially useful given the sheer size of
+the analyzed data, since the computations required storing up to 1 TiB of
+intermediate data on disk.
 
-The Google 2019 Borg cluster traces analysis were conducted by using Apache
-Spark and its Python 3 API (pyspark). Spark was used to execute a series of
-queries to perform various sums and aggregations over the entire dataset
-provided by Google.
+The chosen programming language for writing analysis scripts was Python. Spark
+has very powerful native Python bindings in the form of the _PySpark_ API, which
+were used to implement the various queries.
+
+## Query architecture
 
 In general, each query follows a general Map-Reduce template, where traces are
 first read, parsed, filtered by performing selections, projections and computing
@@ -202,10 +220,10 @@ memory during the query, a projection is often applied to the data by the means
 of a .map() operation over the entire trace set, performed using Spark's RDD
 API.
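+
+A minimal sketch of this projection step (with hypothetical paths and field
+names) could look as follows:
+
+```python
+import json
+
+from pyspark import SparkContext
+
+sc = SparkContext(appName="projection-example")
+
+# Read the sharded, gzip-compressed JSONL table; Spark decompresses each
+# file segment transparently and processes segments in parallel.
+events = sc.textFile("instance_events/*.json.gz").map(json.loads)
+
+# Keep only the two or three fields the query actually needs.
+projected = events.map(lambda e: (e.get("collection_id"),
+                                  e.get("type"),
+                                  e.get("time")))
+```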
-Another operation that is often necessary to perform prior to the Map-Reduce core of
-each query is a record filtering process, which is often motivated by the
-presence of incomplete data (i.e. records which contain fields whose values is
-unknown). This filtering is performed using the .filter() operation of Spark's
+Another operation that is often necessary to perform prior to the Map-Reduce
+core of each query is a record filtering process, which is often motivated by
+the presence of incomplete data (i.e. records which contain fields whose values
+are unknown). This filtering is performed using the .filter() operation of Spark's
 RDD API.
 
 The core of each query is often a groupBy followed by a map() operation on the
@@ -222,6 +240,8 @@ compute and save intermediate results beforehand.
 
 ## General Query script design
 
+
+
 **TBD**
 
 ## Ad-Hoc presentation of some analysis scripts
diff --git a/report/Claudio_Maggioni_report.pdf b/report/Claudio_Maggioni_report.pdf
index 9077899f..9129ab3d 100644
Binary files a/report/Claudio_Maggioni_report.pdf and b/report/Claudio_Maggioni_report.pdf differ
diff --git a/report/Claudio_Maggioni_report.tex b/report/Claudio_Maggioni_report.tex
new file mode 100644
index 00000000..12e00f8d
--- /dev/null
+++ b/report/Claudio_Maggioni_report.tex
@@ -0,0 +1,583 @@
+\documentclass{usiinfbachelorproject}
+\title{Understanding and Comparing Unsuccessful Executions in Large Datacenters}
+\author{Claudio Maggioni}
+
+\usepackage{amsmath}
+\usepackage{subcaption}
+\usepackage{booktabs}
+\usepackage{graphicx}
+
+\captionsetup{labelfont={bf}}
+%\subtitle{The (optional) subtitle}
+
+\versiondate{\today}
+
+\begin{committee}
+\advisor[Universit\`a della Svizzera Italiana,
+Switzerland]{Prof.}{Walter}{Binder}
+\assistant[Universit\`a della Svizzera Italiana,
+Switzerland]{Dr.}{Andrea}{Ros\'a}
+\end{committee}
+
+\abstract{The project aims at comparing two different traces coming from large
+datacenters, focusing in particular on unsuccessful executions of jobs and
+tasks submitted by users. The objective of this project is to compare the
+resource waste caused by unsuccessful executions, their impact on application
+performance, and their root causes. We will show the strong negative impact on
+CPU and RAM usage and on task slowdown. We will analyze patterns of
+unsuccessful jobs and tasks, particularly focusing on their interdependency.
+Moreover, we will uncover their root causes by inspecting key workload and
+system attributes such as machine locality and concurrency level.}
+
+\begin{document}
+
+\tableofcontents
+\newpage
+
+\hypertarget{introduction-including-motivation}{%
+\section{Introduction (including
+Motivation)}\label{introduction-including-motivation}}
+
+\hypertarget{state-of-the-art}{%
+\section{State of the Art}\label{state-of-the-art}}
+
+\hypertarget{introduction}{%
+\subsection{Introduction}\label{introduction}}
+
+\textbf{TBD}
+
+\hypertarget{rosuxe0-et-al.-2015-dsn-paper}{%
+\subsection{Rosà et al.~2015 DSN
+paper}\label{rosuxe0-et-al.-2015-dsn-paper}}
+
+In 2015, Dr.~Andrea Rosà, Lydia Y. Chen, and Prof.~Walter Binder
+published a research paper titled ``Understanding the Dark Side of Big
+Data Clusters: An Analysis beyond Failures'', presenting several
+analyses of Google's 2011 Borg cluster traces. The salient conclusion of
+that research is that a large fraction of the computations performed by
+Google eventually fails, leading to large amounts of computational power
+being wasted.
+
+Our aim with this thesis is to repeat the analysis performed in 2015 on
+the new 2019 dataset to find similarities and differences with the
+previous analysis, and ultimately to determine whether computational
+power is wasted in this new workload as well.
+
+\hypertarget{google-borg}{%
+\subsection{Google Borg}\label{google-borg}}
+
+Borg is Google's own cluster management software. Among the various
+cluster management services it provides, the main ones are: job queuing,
+scheduling, allocation, and deallocation in favour of higher-priority
+computations.
+
+The data this thesis is based on is from 8 Borg ``cells''
+(i.e.~clusters) spanning 8 different datacenters, all focused on
+``compute'' (i.e.~computation-oriented) workloads. The data collection
+timespan matches the entire month of May 2019.
+
+In Google's lingo a ``job'' is a large unit of computational workload
+made up of several ``tasks'', i.e.~executions of single executables,
+each running on a single machine. A job may run tasks sequentially or in
+parallel, and the condition for a job's successful termination is
+nontrivial.
+
+Both task and job lifecycles are represented by several events, which
+are encoded and stored in the trace as rows of various tables. Among the
+information events carry, the field ``type'' describes the execution
+status of the job or task. This field can have the following values:
+
+\begin{itemize}
+\tightlist
+\item
+  \textbf{QUEUE}: The job or task was marked not eligible for scheduling
+  by Borg's scheduler, and thus Borg will move the job/task into a long
+  wait queue;
+\item
+  \textbf{SUBMIT}: The job or task was submitted to Borg for execution;
+\item
+  \textbf{ENABLE}: The job or task became eligible for scheduling;
+\item
+  \textbf{SCHEDULE}: The job or task's execution started;
+\item
+  \textbf{EVICT}: The job or task was terminated in order to free
+  computational resources for a higher-priority job;
+\item
+  \textbf{FAIL}: The job or task terminated its execution unsuccessfully
+  due to a failure;
+\item
+  \textbf{FINISH}: The job or task terminated successfully;
+\item
+  \textbf{KILL}: The job or task terminated its execution because of a
+  manual request to stop it;
+\item
+  \textbf{LOST}: The job or task is assumed to have been terminated, but
+  due to missing data there is insufficient information to identify when
+  or how;
+\item
+  \textbf{UPDATE\_PENDING}: The metadata (scheduling class, resource
+  requirements, \ldots) of the job/task was updated while the job was
+  waiting to be scheduled;
+\item
+  \textbf{UPDATE\_RUNNING}: The metadata (scheduling class, resource
+  requirements, \ldots) of the job/task was updated while the job was in
+  execution;
+\end{itemize}
+
+Figure \ref{fig:eventTypes} shows the expected transitions between event
+types.
+
+\begin{figure}
+\centering
+\includegraphics{./figures/event_types.png}
+\caption{Typical transitions between task/job event types according to
+Google \label{fig:eventTypes}}
+\end{figure}
+
+\hypertarget{traces-contents}{%
+\subsection{Traces contents}\label{traces-contents}}
+
+The traces provided by Google mainly contain a collection of job and
+task events spanning a month of execution of the 8 different clusters.
+In addition, the traces contain data on the machines' configuration in
+terms of resources (i.e.~amount of CPU and RAM) and other
+machine-related metadata.
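+
+For illustration purposes only, a parsed task event could then look like
+the following Python dictionary. The field names are simplified and all
+values are invented for the sake of the example:
+
+\begin{verbatim}
+event = {
+    "time": 370632834337,           # timestamp of the event
+    "type": "FINISH",               # one of the event types listed above
+    "collection_id": 107929354447,  # ID of the job the task belongs to
+    "instance_index": 41,           # index of the task within its job
+    "machine_id": 2086574297,       # machine the task was scheduled on
+}
+\end{verbatim}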
+
+Due to Google's policy, most identification-related data (like job/task
+IDs, raw resource amounts and other text values) were obfuscated prior
+to the release of the traces. One obfuscation that is noteworthy in the
+scope of this thesis is related to CPU and RAM amounts, which are
+expressed respectively in NCUs (\emph{Normalized Compute Units}) and
+NMUs (\emph{Normalized Memory Units}).
+
+NCUs and NMUs are defined based on the raw resource distributions of the
+machines within the 8 clusters. A machine having 1 NCU CPU power and 1
+NMU memory size has the maximum amount of raw CPU power and raw RAM size
+found in the clusters. While RAM size is measured in bytes for
+normalization purposes, CPU power was measured in GCU (\emph{Google
+Compute Units}), a proprietary CPU power measurement unit used by Google
+that combines several parameters like number of processors and cores,
+clock frequency, and architecture (i.e.~ISA).
+
+\hypertarget{overview-of-traces-format}{%
+\subsection{Overview of traces'
+format}\label{overview-of-traces-format}}
+
+The traces have a collective size of approximately 8 TiB and are stored
+in a Gzip-compressed JSONL (JSON lines) format, which means that each
+table is represented by a single logical ``file'' (stored in several
+file segments) where each newline-separated line represents a single
+record for that table.
+
+There are 5 different table ``files'', namely:
+
+\begin{itemize}
+\tightlist
+\item
+  \texttt{machine\_configs}, which is a table containing each physical
+  machine's configuration and its evolution over time;
+\item
+  \texttt{instance\_events}, which is a table of task events;
+\item
+  \texttt{collection\_events}, which is a table of job events;
+\item
+  \texttt{machine\_attributes}, which is a table containing (obfuscated)
+  metadata about each physical machine and its evolution over time;
+\item
+  \texttt{instance\_usage}, which contains resource (CPU/RAM) measures
+  of jobs and tasks running on the single machines.
+\end{itemize}
+
+The scope of this thesis focuses on the tables
+\texttt{machine\_configs}, \texttt{instance\_events} and
+\texttt{collection\_events}.
+
+\hypertarget{remark-on-traces-size}{%
+\subsection{Remark on traces size}\label{remark-on-traces-size}}
+
+While the 2011 Google Borg traces were relatively small, with a total
+size in the order of the tens of gigabytes, the 2019 traces are quite
+challenging to analyze due to their sheer size. As stated before, the
+traces have a total size of 8 TiB when stored in the format provided by
+Google. Even when broken down to table ``files'', unitary sizes still
+reach the single-tebibyte mark (namely for \texttt{machine\_configs},
+the largest table in the trace).
+
+Due to these constraints, an approach based on careful data engineering
+was used when reproducing the 2015 DSN paper analysis. Bleeding-edge
+data science technologies like Apache Spark were used to achieve
+efficient and parallelized computations. This approach is discussed in
+further detail in the following section.
+
+\hypertarget{project-requirements-and-analysis}{%
+\section{Project requirements and
+analysis}\label{project-requirements-and-analysis}}
+
+\textbf{TBD} (describe our objective with this analysis in detail)
+
+\hypertarget{analysis-methodology}{%
+\section{Analysis methodology}\label{analysis-methodology}}
+
+\textbf{TBD}
+
+\hypertarget{introduction-on-apache-spark}{%
+\subsection{Introduction on Apache
+Spark}\label{introduction-on-apache-spark}}
+
+Apache Spark is a unified analytics engine for large-scale data
+processing. In layman's terms, Spark provides a fast and streamlined way
+to parallelize computations.
+
+In the scope of this thesis, Spark was used essentially as a Map-Reduce
+framework for computing aggregated results on the various tables. Due to
+the sharded nature of table ``files'', Spark is able to spawn a thread
+per file segment and run computations using all processors of the server
+machines used to run the analysis.
+
+Spark is also quite powerful since it provides automated thread pooling
+services, and it is able to efficiently store and cache intermediate
+computations on secondary storage without any additional effort required
+from the data engineer. This feature was especially useful given the
+sheer size of the analyzed data, since the computations required storing
+up to 1 TiB of intermediate data on disk.
+
+The chosen programming language for writing analysis scripts was Python.
+Spark has very powerful native Python bindings in the form of the
+\emph{PySpark} API, which were used to implement the various queries.
+
+\hypertarget{query-architecture}{%
+\subsection{Query architecture}\label{query-architecture}}
+
+Each query follows a general Map-Reduce template, where traces are first
+read, parsed, and filtered by performing selections, projections and the
+computation of new derived fields. Then, the trace records are often
+grouped by one of their fields, clustering related data together before
+a reduce or fold operation is applied to each grouping.
+
+Most input data is in JSONL format and adheres to a schema Google
+provided in the form of a protocol buffer specification\footnote{\href{https://github.com/google/cluster-data/blob/master/clusterdata_trace_format_v3.proto}{Google
+  2019 Borg traces Protobuffer specification on Github}}.
+
+One of the main quirks in the traces is that fields that have a ``zero''
+value (i.e.~a value like 0 or the empty string) are often omitted from
+the JSON object records. When reading the traces with Apache Spark, it
+is therefore necessary to check for this possibility and populate those
+zero fields when they are omitted.
+
+Most queries use only two or three fields of each trace record, while
+the original records often consist of a couple dozen fields. In order to
+save memory during the query, a projection is often applied to the data
+by means of a .map() operation over the entire trace set, performed
+using Spark's RDD API.
+
+Another operation that is often necessary to perform prior to the
+Map-Reduce core of each query is a record filtering process, which is
+often motivated by the presence of incomplete data (i.e.~records which
+contain fields whose values are unknown). This filtering is performed
+using the .filter() operation of Spark's RDD API.
+
+The core of each query is often a groupBy followed by a map() operation
+on the aggregated data. The groupBy groups the set of all records into
+several subsets of records, each sharing some common property. Then,
+each of these small clusters is reduced to a single record with a .map()
+operation. The motivation behind this computation is often to analyze a
+time series of several different traces of programs. This is implemented
+by groupBy()-ing records by program ID, and then map()-ing each program
+trace set by sorting its traces by time and computing the desired
+property in the form of a record. A minimal sketch of this query
+template is shown below.
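+
+In the following sketch, the input path, the field names and the
+computed property (the last event type of each job) are all invented for
+the sake of the example:
+
+\begin{verbatim}
+import json
+
+from pyspark import SparkContext
+
+sc = SparkContext(appName="query-template")
+
+def parse(line):
+    # "Zero" fields may be omitted from the JSON records: re-insert
+    # them with default values so later steps can rely on them.
+    record = json.loads(line)
+    record.setdefault("type", 0)
+    record.setdefault("time", 0)
+    return record
+
+events = sc.textFile("instance_events/*.json.gz").map(parse)
+
+# Projection: keep only the fields this query needs.
+projected = events.map(
+    lambda e: (e.get("collection_id"), (e["time"], e["type"])))
+
+# Filtering: discard incomplete records (here, ones with no job ID).
+filtered = projected.filter(lambda kv: kv[0] is not None)
+
+# Map-Reduce core: group events by job ID, then reduce each group by
+# sorting its events by time and keeping the last event type.
+def last_event_type(events_for_job):
+    return sorted(events_for_job, key=lambda te: te[0])[-1][1]
+
+result = filtered.groupByKey().mapValues(last_event_type)
+result.saveAsTextFile("output/last_event_type")
+\end{verbatim}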
+
+Sometimes intermediate results are saved to disk in Spark's Parquet
+format, so that expensive computations can be performed once and their
+results reused by several queries.
+
+\hypertarget{general-query-script-design}{%
+\subsection{General Query script
+design}\label{general-query-script-design}}
+
+\textbf{TBD}
+
+\hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{%
+\subsection{Ad-Hoc presentation of some analysis
+scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}}
+
+\textbf{TBD} (with diagrams)
+
+\hypertarget{analysis-and-observations}{%
+\section{Analysis and observations}\label{analysis-and-observations}}
+
+\hypertarget{overview-of-machine-configurations-in-each-cluster}{%
+\subsection{Overview of machine configurations in each
+cluster}\label{overview-of-machine-configurations-in-each-cluster}}
+
+\input{figures/machine_configs}
+
+Refer to figure \ref{fig:machineconfigs}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  machine configurations are definitely more varied than the ones in the
+  2011 traces;
+\item
+  some clusters have more machine variability than others.
+\end{itemize}
+
+\hypertarget{analysis-of-execution-time-per-each-execution-phase}{%
+\subsection{Analysis of execution time per each execution
+phase}\label{analysis-of-execution-time-per-each-execution-phase}}
+
+\input{figures/machine_time_waste}
+
+Refer to figures \ref{fig:machinetimewaste-abs} and
+\ref{fig:machinetimewaste-rel}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  Across all clusters almost 50\% of time is spent in ``unknown''
+  transitions, i.e.~there are some time slices that are related to a
+  state transition that Google says is not a ``typical'' transition.
+  This is mostly due to the trace log being intermittent when recording
+  all state transitions.
+\item
+  80\% of the time spent in KILL and LOST is unknown. This is
+  predictable, since both states indicate that the job execution is not
+  stable (in particular, LOST is used when the state logging itself is
+  unstable).
+\item
+  From the absolute graph we see that the time ``wasted'' on jobs that
+  do not terminate with FINISH is very significant.
+\item
+  Execution is the most significant task phase, followed by queuing time
+  and scheduling time (``ready'' state).
+\item
+  In the absolute graph we see that a significant amount of time is
+  spent re-scheduling evicted jobs (``evicted'' state).
+\item
+  Cluster A has unusually high queuing times.
+\end{itemize}
+
+\hypertarget{task-slowdown}{%
+\subsection{Task slowdown}\label{task-slowdown}}
+
+\input{figures/task_slowdown}
+
+Refer to figure \ref{fig:taskslowdown}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  Priority values are different from the 0-11 values in the 2011 traces.
+  A conversion table is provided by Google;
+\item
+  For some priorities (e.g.~101 for cluster D) the relative number of
+  finishing tasks is very low and the mean slowdown is very high (315;
+  see the definition sketched after this list). This behaviour differs
+  from the relatively homogeneous values of the 2011 traces;
+\item
+  Some slowdown values cannot be computed since either some tasks have a
+  0ns execution time or, for some priorities, no tasks in the traces
+  terminate successfully. More raw data on those exceptions is available
+  in the Jupyter notebooks;
+\item
+  The \% of finishing jobs is relatively low compared with the 2011
+  traces.
+\end{itemize}
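+
+Here ``slowdown'' is intended as in the 2015 DSN paper. Loosely
+speaking, and assuming the same definition, for a task that eventually
+terminates successfully it relates the time spent across all of its
+executions to the duration of its final, successful execution:
+
+\begin{equation*}
+\text{slowdown} = \frac{\sum_{i=1}^{n} t_i}{t_n},
+\end{equation*}
+
+where $t_1, \ldots, t_n$ are the execution times of the task's $n$
+execution attempts and the $n$-th attempt is the successful one. A mean
+slowdown of 315 therefore means that, on average, such a task spent 315
+times the duration of its final execution inside the system.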
+
+\hypertarget{reserved-and-actual-resource-usage-of-tasks}{%
+\subsection{Reserved and actual resource usage of
+tasks}\label{reserved-and-actual-resource-usage-of-tasks}}
+
+\input{figures/spatial_resource_waste}
+
+Refer to figures \ref{fig:spatialresourcewaste-actual} and
+\ref{fig:spatialresourcewaste-requested}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  Most (measured and requested) resources are used by killed jobs, even
+  more so than in the 2011 traces;
+\item
+  Behaviour is rather homogeneous across datacenters, with the exception
+  of cluster G, where LOST-terminated tasks acquired 70\% of both CPU
+  and RAM.
+\end{itemize}
+
+\hypertarget{correlation-between-task-events-metadata-and-task-termination}{%
+\subsection{Correlation between task events' metadata and task
+termination}\label{correlation-between-task-events-metadata-and-task-termination}}
+
+\input{figures/figure_7}
+
+Refer to figures \ref{fig:figureVII-a}, \ref{fig:figureVII-b}, and
+\ref{fig:figureVII-c}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  There are no smooth curves in this figure either, unlike in the 2011
+  traces;
+\item
+  The behaviour of the curves in 7a (priority) is almost the opposite of
+  2011, i.e.~in-between priorities have higher kill rates while
+  priorities at the extremes have lower kill rates. This could also be
+  due to the inherent distribution of job terminations;
+\item
+  Event execution time curves are quite different from the 2011 ones:
+  here there seems to be a good correlation between short task execution
+  times and finish event rates, instead of the U-shaped curve observed
+  in the 2015 DSN paper;
+\item
+  In figure \ref{fig:figureVII-b} cluster behaviour seems quite uniform;
+\item
+  Machine concurrency seems to play little role in the event termination
+  distribution, as for all concurrency factors the kill rate is at 90\%.
+\end{itemize}
+
+\hypertarget{correlation-between-task-events-resource-metadata-and-task-termination}{%
+\subsection{Correlation between task events' resource metadata and task
+termination}\label{correlation-between-task-events-resource-metadata-and-task-termination}}
+
+\hypertarget{correlation-between-job-events-metadata-and-job-termination}{%
+\subsection{Correlation between job events' metadata and job
+termination}\label{correlation-between-job-events-metadata-and-job-termination}}
+
+\input{figures/figure_9}
+
+Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and
+\ref{fig:figureIX-c}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  Behaviour varies a lot between clusters;
+\item
+  There are no ``smooth'' gradients in the various curves, unlike in the
+  2011 traces;
+\item
+  Killed jobs have higher event rates in general, and overall dominate
+  all event rate measures;
+\item
+  There still seems to be a correlation between short job execution
+  times and successful final termination, and likewise between kill
+  terminations and longer job execution times;
+\item
+  Across all clusters, a machine locality factor of 1 seems to lead to
+  the highest success event rate.
+\end{itemize}
+
+\hypertarget{mean-number-of-tasks-and-event-distribution-per-task-type}{%
+\subsection{Mean number of tasks and event distribution per task
+type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}}
+
+\input{figures/table_iii}
+
+Refer to figure \ref{fig:tableIII}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  The mean number of events per task is an order of magnitude higher
+  than in the 2011 traces;
+\item
+  Generally speaking, the event type with the highest mean is the
+  termination event for the task;
+\item
+  The \# evts mean is higher than the sum of all other event type means,
+  since it appears there are a lot more non-termination events in the
+  2019 traces.
+\end{itemize}
+
+\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
+\subsection{Mean number of tasks and event distribution per job
+type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
+
+\input{figures/table_iv}
+
+Refer to figure \ref{fig:tableIV}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  Again, the mean number of tasks is significantly higher than in the
+  2011 traces, indicating a higher complexity of workloads;
+\item
+  Cluster A has no evicted jobs;
+\item
+  The number of events is however lower than the event means in the 2011
+  traces.
+\end{itemize}
+
+\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
+\subsection{Probability of task successful termination given its
+unsuccessful
+events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
+
+\input{figures/figure_5}
+
+Refer to figure \ref{fig:figureV}.
+
+\textbf{Observations}:
+
+\begin{itemize}
+\tightlist
+\item
+  Behaviour is very different from cluster to cluster;
+\item
+  There is no easy conclusion, unlike in 2011, on the correlation
+  between successful-termination probability and the \# of events of a
+  specific type;
+\item
+  Clusters B, C and D in particular have very jagged lines that vary a
+  lot for small \# evts differences. This may be due to an uneven
+  distribution of \# evts in the traces.
+\end{itemize}
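+
+Formally, assuming the same definition as the 2015 paper, the quantity
+plotted in this figure can be read as the conditional probability
+
+\begin{equation*}
+P\left(\text{task terminates with FINISH} \mid N_e = k\right),
+\end{equation*}
+
+where $N_e$ is the number of unsuccessful events of type $e$ (e.g.~EVICT
+or FAIL) experienced by the task, and $k$ is the value on the x-axis.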
+
+\hypertarget{potential-causes-of-unsuccesful-executions}{%
+\subsection{Potential causes of unsuccessful
+executions}\label{potential-causes-of-unsuccesful-executions}}
+
+\textbf{TBD}
+
+\hypertarget{implementation-issues-analysis-limitations}{%
+\section{Implementation issues -- Analysis
+limitations}\label{implementation-issues-analysis-limitations}}
+
+\hypertarget{discussion-on-unknown-fields}{%
+\subsection{Discussion on unknown
+fields}\label{discussion-on-unknown-fields}}
+
+\textbf{TBD}
+
+\hypertarget{limitation-on-computation-resources-required-for-the-analysis}{%
+\subsection{Limitations on the computation resources required for the
+analysis}\label{limitation-on-computation-resources-required-for-the-analysis}}
+
+\textbf{TBD}
+
+\hypertarget{other-limitations}{%
+\subsection{Other limitations \ldots{}}\label{other-limitations}}
+
+\textbf{TBD}
+
+\hypertarget{conclusions-and-future-work-or-possible-developments}{%
+\section{Conclusions and future work or possible
+developments}\label{conclusions-and-future-work-or-possible-developments}}
+
+\textbf{TBD}
+
+\end{document}
diff --git a/status.ods b/status.ods
index 56b4e7aa..071ec28e 100644
Binary files a/status.ods and b/status.ods differ