test
This commit is contained in:
parent
0d935c2112
commit
3746bfc9c9
6 changed files with 909 additions and 17 deletions
290
.gitignore
vendored
290
.gitignore
vendored
|
@ -8,3 +8,293 @@ figure_9/*.parquet/
|
||||||
figure_9/?_task_count/
|
figure_9/?_task_count/
|
||||||
figure_9/?_machine_locality/
|
figure_9/?_machine_locality/
|
||||||
table_iii/*.parquet/
|
table_iii/*.parquet/
|
||||||
|
|
||||||
|
## Core latex/pdflatex auxiliary files:
|
||||||
|
*.aux
|
||||||
|
*.lof
|
||||||
|
*.log
|
||||||
|
*.lot
|
||||||
|
*.fls
|
||||||
|
*.out
|
||||||
|
*.toc
|
||||||
|
*.fmt
|
||||||
|
*.fot
|
||||||
|
*.cb
|
||||||
|
*.cb2
|
||||||
|
.*.lb
|
||||||
|
|
||||||
|
## Intermediate documents:
|
||||||
|
*.dvi
|
||||||
|
*.xdv
|
||||||
|
*-converted-to.*
|
||||||
|
# these rules might exclude image files for figures etc.
|
||||||
|
# *.ps
|
||||||
|
# *.eps
|
||||||
|
# *.pdf
|
||||||
|
|
||||||
|
## Generated if empty string is given at "Please type another file name for output:"
|
||||||
|
.pdf
|
||||||
|
|
||||||
|
## Bibliography auxiliary files (bibtex/biblatex/biber):
|
||||||
|
*.bbl
|
||||||
|
*.bcf
|
||||||
|
*.blg
|
||||||
|
*-blx.aux
|
||||||
|
*-blx.bib
|
||||||
|
*.run.xml
|
||||||
|
|
||||||
|
## Build tool auxiliary files:
|
||||||
|
*.fdb_latexmk
|
||||||
|
*.synctex
|
||||||
|
*.synctex(busy)
|
||||||
|
*.synctex.gz
|
||||||
|
*.synctex.gz(busy)
|
||||||
|
*.pdfsync
|
||||||
|
|
||||||
|
## Build tool directories for auxiliary files
|
||||||
|
# latexrun
|
||||||
|
latex.out/
|
||||||
|
|
||||||
|
## Auxiliary and intermediate files from other packages:
|
||||||
|
# algorithms
|
||||||
|
*.alg
|
||||||
|
*.loa
|
||||||
|
|
||||||
|
# achemso
|
||||||
|
acs-*.bib
|
||||||
|
|
||||||
|
# amsthm
|
||||||
|
*.thm
|
||||||
|
|
||||||
|
# beamer
|
||||||
|
*.nav
|
||||||
|
*.pre
|
||||||
|
*.snm
|
||||||
|
*.vrb
|
||||||
|
|
||||||
|
# changes
|
||||||
|
*.soc
|
||||||
|
|
||||||
|
# comment
|
||||||
|
*.cut
|
||||||
|
|
||||||
|
# cprotect
|
||||||
|
*.cpt
|
||||||
|
|
||||||
|
# elsarticle (documentclass of Elsevier journals)
|
||||||
|
*.spl
|
||||||
|
|
||||||
|
# endnotes
|
||||||
|
*.ent
|
||||||
|
|
||||||
|
# fixme
|
||||||
|
*.lox
|
||||||
|
|
||||||
|
# feynmf/feynmp
|
||||||
|
*.mf
|
||||||
|
*.mp
|
||||||
|
*.t[1-9]
|
||||||
|
*.t[1-9][0-9]
|
||||||
|
*.tfm
|
||||||
|
|
||||||
|
#(r)(e)ledmac/(r)(e)ledpar
|
||||||
|
*.end
|
||||||
|
*.?end
|
||||||
|
*.[1-9]
|
||||||
|
*.[1-9][0-9]
|
||||||
|
*.[1-9][0-9][0-9]
|
||||||
|
*.[1-9]R
|
||||||
|
*.[1-9][0-9]R
|
||||||
|
*.[1-9][0-9][0-9]R
|
||||||
|
*.eledsec[1-9]
|
||||||
|
*.eledsec[1-9]R
|
||||||
|
*.eledsec[1-9][0-9]
|
||||||
|
*.eledsec[1-9][0-9]R
|
||||||
|
*.eledsec[1-9][0-9][0-9]
|
||||||
|
*.eledsec[1-9][0-9][0-9]R
|
||||||
|
|
||||||
|
# glossaries
|
||||||
|
*.acn
|
||||||
|
*.acr
|
||||||
|
*.glg
|
||||||
|
*.glo
|
||||||
|
*.gls
|
||||||
|
*.glsdefs
|
||||||
|
*.lzo
|
||||||
|
*.lzs
|
||||||
|
|
||||||
|
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
|
||||||
|
# *.ist
|
||||||
|
|
||||||
|
# gnuplottex
|
||||||
|
*-gnuplottex-*
|
||||||
|
|
||||||
|
# gregoriotex
|
||||||
|
*.gaux
|
||||||
|
*.glog
|
||||||
|
*.gtex
|
||||||
|
|
||||||
|
# htlatex
|
||||||
|
*.4ct
|
||||||
|
*.4tc
|
||||||
|
*.idv
|
||||||
|
*.lg
|
||||||
|
*.trc
|
||||||
|
*.xref
|
||||||
|
|
||||||
|
# hyperref
|
||||||
|
*.brf
|
||||||
|
|
||||||
|
# knitr
|
||||||
|
*-concordance.tex
|
||||||
|
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
|
||||||
|
# *.tikz
|
||||||
|
*-tikzDictionary
|
||||||
|
|
||||||
|
# listings
|
||||||
|
*.lol
|
||||||
|
|
||||||
|
# luatexja-ruby
|
||||||
|
*.ltjruby
|
||||||
|
|
||||||
|
# makeidx
|
||||||
|
*.idx
|
||||||
|
*.ilg
|
||||||
|
*.ind
|
||||||
|
|
||||||
|
# minitoc
|
||||||
|
*.maf
|
||||||
|
*.mlf
|
||||||
|
*.mlt
|
||||||
|
*.mtc[0-9]*
|
||||||
|
*.slf[0-9]*
|
||||||
|
*.slt[0-9]*
|
||||||
|
*.stc[0-9]*
|
||||||
|
|
||||||
|
# minted
|
||||||
|
_minted*
|
||||||
|
*.pyg
|
||||||
|
|
||||||
|
# morewrites
|
||||||
|
*.mw
|
||||||
|
|
||||||
|
# newpax
|
||||||
|
*.newpax
|
||||||
|
|
||||||
|
# nomencl
|
||||||
|
*.nlg
|
||||||
|
*.nlo
|
||||||
|
*.nls
|
||||||
|
|
||||||
|
# pax
|
||||||
|
*.pax
|
||||||
|
|
||||||
|
# pdfpcnotes
|
||||||
|
*.pdfpc
|
||||||
|
|
||||||
|
# sagetex
|
||||||
|
*.sagetex.sage
|
||||||
|
*.sagetex.py
|
||||||
|
*.sagetex.scmd
|
||||||
|
|
||||||
|
# scrwfile
|
||||||
|
*.wrt
|
||||||
|
|
||||||
|
# sympy
|
||||||
|
*.sout
|
||||||
|
*.sympy
|
||||||
|
sympy-plots-for-*.tex/
|
||||||
|
|
||||||
|
# pdfcomment
|
||||||
|
*.upa
|
||||||
|
*.upb
|
||||||
|
|
||||||
|
# pythontex
|
||||||
|
*.pytxcode
|
||||||
|
pythontex-files-*/
|
||||||
|
|
||||||
|
# tcolorbox
|
||||||
|
*.listing
|
||||||
|
|
||||||
|
# thmtools
|
||||||
|
*.loe
|
||||||
|
|
||||||
|
# TikZ & PGF
|
||||||
|
*.dpth
|
||||||
|
*.md5
|
||||||
|
*.auxlock
|
||||||
|
|
||||||
|
# todonotes
|
||||||
|
*.tdo
|
||||||
|
|
||||||
|
# vhistory
|
||||||
|
*.hst
|
||||||
|
*.ver
|
||||||
|
|
||||||
|
# easy-todo
|
||||||
|
*.lod
|
||||||
|
|
||||||
|
# xcolor
|
||||||
|
*.xcp
|
||||||
|
|
||||||
|
# xmpincl
|
||||||
|
*.xmpi
|
||||||
|
|
||||||
|
# xindy
|
||||||
|
*.xdy
|
||||||
|
|
||||||
|
# xypic precompiled matrices and outlines
|
||||||
|
*.xyc
|
||||||
|
*.xyd
|
||||||
|
|
||||||
|
# endfloat
|
||||||
|
*.ttt
|
||||||
|
*.fff
|
||||||
|
|
||||||
|
# Latexian
|
||||||
|
TSWLatexianTemp*
|
||||||
|
|
||||||
|
## Editors:
|
||||||
|
# WinEdt
|
||||||
|
*.bak
|
||||||
|
*.sav
|
||||||
|
|
||||||
|
# Texpad
|
||||||
|
.texpadtmp
|
||||||
|
|
||||||
|
# LyX
|
||||||
|
*.lyx~
|
||||||
|
|
||||||
|
# Kile
|
||||||
|
*.backup
|
||||||
|
|
||||||
|
# gummi
|
||||||
|
.*.swp
|
||||||
|
|
||||||
|
# KBibTeX
|
||||||
|
*~[0-9]*
|
||||||
|
|
||||||
|
# TeXnicCenter
|
||||||
|
*.tps
|
||||||
|
|
||||||
|
# auto folder when using emacs and auctex
|
||||||
|
./auto/*
|
||||||
|
*.el
|
||||||
|
|
||||||
|
# expex forward references with \gathertags
|
||||||
|
*-tags.tex
|
||||||
|
|
||||||
|
# standalone packages
|
||||||
|
*.sta
|
||||||
|
|
||||||
|
# Makeindex log files
|
||||||
|
*.lpz
|
||||||
|
|
||||||
|
# xwatermark package
|
||||||
|
*.xwm
|
||||||
|
|
||||||
|
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
|
||||||
|
# option is specified. Footnotes are the stored in a file with suffix Notes.bib.
|
||||||
|
# Uncomment the next line to have this generated file ignored.
|
||||||
|
#*Notes.bib
|
||||||
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
,maggicl,Apple2gs.local,16.05.2021 14:55,file:///Users/maggicl/Library/Application%20Support/LibreOffice/4;
|
|
|
@ -52,7 +52,17 @@ header-includes:
|
||||||
|
|
||||||
## Rosà et al. 2015 DSN paper
|
## Rosà et al. 2015 DSN paper
|
||||||
|
|
||||||
**TBD**
|
In 2015, Dr. Andrea Rosà, Lydia Y. Chen, Prof. Walter Binder published a
|
||||||
|
research paper titled "Understanding the Dark Side of Big Data Clusters:
|
||||||
|
An Analysis beyond Failures" performing several analysis on Google's 2011
|
||||||
|
Borg cluster traces. The salient conclusion of that research is that lots of
|
||||||
|
computation performed by Google would eventually fail, leading to large amounts
|
||||||
|
of computational power being wasted.
|
||||||
|
|
||||||
|
Our aim with this thesis is to repeat the analysis performed in 2015 on the new
|
||||||
|
2019 dataset to find similarities and differences with the previous analysis,
|
||||||
|
and ulimately find if computational power is indeed wasted in this new workload
|
||||||
|
as well.
|
||||||
|
|
||||||
## Google Borg
|
## Google Borg
|
||||||
|
|
||||||
|
@ -162,22 +172,30 @@ This approach is discussed with further detail in the following section.
|
||||||
|
|
||||||
**TBD**
|
**TBD**
|
||||||
|
|
||||||
## Overview on challenging aspects of analysis (data size, schema, avaliable computation resources)
|
|
||||||
|
|
||||||
**TBD**
|
|
||||||
|
|
||||||
## Introduction on Apache Spark
|
## Introduction on Apache Spark
|
||||||
|
|
||||||
**TBD**
|
Apache Spark is a unified analytics engine for large-scale data processing. In
|
||||||
|
layman's terms, Spark is really useful to parallelize computations in a fast and
|
||||||
|
streamlined way.
|
||||||
|
|
||||||
## General workflow description of apache spark workflow
|
In the scope of this thesis, Spark was used essentially as a Map-Reduce
|
||||||
|
framework for computing aggregated results on the various tables. Due to the
|
||||||
|
sharded nature of table "files", Spark is able to spawn a thread per file and
|
||||||
|
run computations using all processors on the server machines used to run the
|
||||||
|
analysis.
|
||||||
|
|
||||||
**TBD** (extract from the notes sent to Filippo shown below)
|
Spark is also quite powerful since it provides automated thread pooling
|
||||||
|
services, and it is able to efficiently store and cache intermediate computation
|
||||||
|
on secondary storage without any additional effort required from the data
|
||||||
|
engineer. This feature was especially useful due to the sheer size of the
|
||||||
|
analyzed data, since the computations required to store up to 1TiB of
|
||||||
|
intermediate data on disk.
|
||||||
|
|
||||||
The Google 2019 Borg cluster traces analysis were conducted by using Apache
|
The chosen programming language for writing analysis scripts was Python. Spark
|
||||||
Spark and its Python 3 API (pyspark). Spark was used to execute a series of
|
has very powerful native Python bindings in the form of the _PySpark_ API, which
|
||||||
queries to perform various sums and aggregations over the entire dataset
|
were used to implement the various queries.
|
||||||
provided by Google.
|
|
||||||
|
## Query architecture
|
||||||
|
|
||||||
In general, each query follows a general Map-Reduce template, where traces are
|
In general, each query follows a general Map-Reduce template, where traces are
|
||||||
first read, parsed, filtered by performing selections, projections and computing
|
first read, parsed, filtered by performing selections, projections and computing
|
||||||
|
@ -202,10 +220,10 @@ memory during the query, a projection is often applied to the data by the means
|
||||||
of a .map() operation over the entire trace set, performed using Spark's RDD
|
of a .map() operation over the entire trace set, performed using Spark's RDD
|
||||||
API.
|
API.
|
||||||
|
|
||||||
Another operation that is often necessary to perform prior to the Map-Reduce core of
|
Another operation that is often necessary to perform prior to the Map-Reduce
|
||||||
each query is a record filtering process, which is often motivated by the
|
core of each query is a record filtering process, which is often motivated by
|
||||||
presence of incomplete data (i.e. records which contain fields whose values is
|
the presence of incomplete data (i.e. records which contain fields whose values
|
||||||
unknown). This filtering is performed using the .filter() operation of Spark's
|
is unknown). This filtering is performed using the .filter() operation of Spark's
|
||||||
RDD API.
|
RDD API.
|
||||||
|
|
||||||
The core of each query is often a groupBy followed by a map() operation on the
|
The core of each query is often a groupBy followed by a map() operation on the
|
||||||
|
@ -222,6 +240,8 @@ compute and save intermediate results beforehand.
|
||||||
|
|
||||||
## General Query script design
|
## General Query script design
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
**TBD**
|
**TBD**
|
||||||
|
|
||||||
## Ad-Hoc presentation of some analysis scripts
|
## Ad-Hoc presentation of some analysis scripts
|
||||||
|
|
Binary file not shown.
583
report/Claudio_Maggioni_report.tex
Normal file
583
report/Claudio_Maggioni_report.tex
Normal file
|
@ -0,0 +1,583 @@
|
||||||
|
\documentclass{usiinfbachelorproject}
|
||||||
|
\title{Understanding and Comparing Unsuccessful Executions in Large Datacenters}
|
||||||
|
\author{Claudio Maggioni}
|
||||||
|
|
||||||
|
\usepackage{amsmath}
|
||||||
|
\usepackage{subcaption}
|
||||||
|
\usepackage{booktabs}
|
||||||
|
\usepackage{graphicx}
|
||||||
|
|
||||||
|
\captionsetup{labelfont={bf}}
|
||||||
|
%\subtitle{The (optional) subtitle}
|
||||||
|
|
||||||
|
\versiondate{\today}
|
||||||
|
|
||||||
|
\begin{committee}
|
||||||
|
\advisor[Universit\`a della Svizzera Italiana,
|
||||||
|
Switzerland]{Prof.}{Walter}{Binder}
|
||||||
|
\assistant[Universit\`a della Svizzera Italiana,
|
||||||
|
Switzerland]{Dr.}{Andrea}{Ros\'a}
|
||||||
|
\end{committee}
|
||||||
|
|
||||||
|
\abstract{The project aims at comparing two different traces coming from large
|
||||||
|
datacenters, focusing in particular on unsuccessful executions of jobs and
|
||||||
|
tasks submitted by users. The objective of this project is to compare the
|
||||||
|
resource waste caused by unsuccessful executions, their impact on application
|
||||||
|
performance, and their root causes. We will show the strong negative impact on
|
||||||
|
CPU and RAM usage and on task slowdown. We will analyze patterns of
|
||||||
|
unsuccessful jobs and tasks, particularly focusing on their interdependency.
|
||||||
|
Moreover, we will uncover their root causes by inspecting key workload and
|
||||||
|
system attributes such asmachine locality and concurrency level.}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
|
||||||
|
\tableofcontents
|
||||||
|
\newpage
|
||||||
|
|
||||||
|
\hypertarget{introduction-including-motivation}{%
|
||||||
|
\section{Introduction (including
|
||||||
|
Motivation)}\label{introduction-including-motivation}}
|
||||||
|
|
||||||
|
\hypertarget{state-of-the-art}{%
|
||||||
|
\section{State of the Art}\label{state-of-the-art}}
|
||||||
|
|
||||||
|
\hypertarget{introduction}{%
|
||||||
|
\subsection{Introduction}\label{introduction}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{rosuxe0-et-al.-2015-dsn-paper}{%
|
||||||
|
\subsection{Rosà et al.~2015 DSN
|
||||||
|
paper}\label{rosuxe0-et-al.-2015-dsn-paper}}
|
||||||
|
|
||||||
|
In 2015, Dr.~Andrea Rosà, Lydia Y. Chen, Prof.~Walter Binder published a
|
||||||
|
research paper titled ``Understanding the Dark Side of Big Data
|
||||||
|
Clusters: An Analysis beyond Failures'' performing several analysis on
|
||||||
|
Google's 2011 Borg cluster traces. The salient conclusion of that
|
||||||
|
research is that lots of computation performed by Google would
|
||||||
|
eventually fail, leading to large amounts of computational power being
|
||||||
|
wasted.
|
||||||
|
|
||||||
|
Our aim with this thesis is to repeat the analysis performed in 2015 on
|
||||||
|
the new 2019 dataset to find similarities and differences with the
|
||||||
|
previous analysis, and ulimately find if computational power is indeed
|
||||||
|
wasted in this new workload as well.
|
||||||
|
|
||||||
|
\hypertarget{google-borg}{%
|
||||||
|
\subsection{Google Borg}\label{google-borg}}
|
||||||
|
|
||||||
|
Borg is Google's own cluster management software. Among the various
|
||||||
|
cluster management services it provides, the main ones are: job queuing,
|
||||||
|
scheduling, allocation, and deallocation due to higher priority
|
||||||
|
computations.
|
||||||
|
|
||||||
|
The data this thesis is based on is from 8 Borg ``cells''
|
||||||
|
(i.e.~clusters) spanning 8 different datacenters, all focused on
|
||||||
|
``compute'' (i.e.~computational oriented) workloads. The data collection
|
||||||
|
timespan matches the entire month of May 2019.
|
||||||
|
|
||||||
|
In Google's lingo a ``job'' is a large unit of computational workload
|
||||||
|
made up of several ``tasks'', i.e.~a number of executions of single
|
||||||
|
executables running on a single machine. A job may run tasks
|
||||||
|
sequentially or in parallel, and the condition for a job's succesful
|
||||||
|
termination is nontrivial.
|
||||||
|
|
||||||
|
Both tasks and jobs lifecyles are represented by several events, which
|
||||||
|
are encoded and stored in the trace as rows of various tables. Among the
|
||||||
|
information events provide, the field ``type'' provides information on
|
||||||
|
the execution status of the job or task. This field can have the
|
||||||
|
following values:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
\textbf{QUEUE}: The job or task was marked not eligible for scheduling
|
||||||
|
by Borg's scheduler, and thus Borg will move the job/task in a long
|
||||||
|
wait queue;
|
||||||
|
\item
|
||||||
|
\textbf{SUBMIT}: The job or task was submitted to Borg for execution;
|
||||||
|
\item
|
||||||
|
\textbf{ENABLE}: The job or task became eligible for scheduling;
|
||||||
|
\item
|
||||||
|
\textbf{SCHEDULE}: The job or task's execution started;
|
||||||
|
\item
|
||||||
|
\textbf{EVICT}: The job or task was terminated in order to free
|
||||||
|
computational resources for an higher priority job;
|
||||||
|
\item
|
||||||
|
\textbf{FAIL}: The job or task terminated its execution unsuccesfully
|
||||||
|
due to a failure;
|
||||||
|
\item
|
||||||
|
\textbf{FINISH}: The job or task terminated succesfully;
|
||||||
|
\item
|
||||||
|
\textbf{KILL}: The job or task terminated its execution because of a
|
||||||
|
manual request to stop it;
|
||||||
|
\item
|
||||||
|
\textbf{LOST}: It is assumed a job or task is has been terminated, but
|
||||||
|
due to missing data there is insufficent information to identify when
|
||||||
|
or how;
|
||||||
|
\item
|
||||||
|
\textbf{UPDATE\_PENDING}: The metadata (scheduling class, resource
|
||||||
|
requirements, \ldots) of the job/task was updated while the job was
|
||||||
|
waiting to be scheduled;
|
||||||
|
\item
|
||||||
|
\textbf{UPDATE\_RUNNING}: The metadata (scheduling class, resource
|
||||||
|
requirements, \ldots) of the job/task was updated while the job was in
|
||||||
|
execution;
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Figure \ref{fig:eventTypes} shows the expected transitions between event
|
||||||
|
types.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics{./figures/event_types.png}
|
||||||
|
\caption{Typical transitions between task/job event types according to
|
||||||
|
Google \label{fig:eventTypes}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\hypertarget{traces-contents}{%
|
||||||
|
\subsection{Traces contents}\label{traces-contents}}
|
||||||
|
|
||||||
|
The traces provided by Google contain mainly a collection of job and
|
||||||
|
task events spanning a month of execution of the 8 different clusters.
|
||||||
|
In addition to this data, some additional data on the machines'
|
||||||
|
configuration in terms of resources (i.e.~amount of CPU and RAM) and
|
||||||
|
additional machine-related metadata.
|
||||||
|
|
||||||
|
Due to Google's policy, most identification related data (like job/task
|
||||||
|
IDs, raw resource amounts and other text values) were obfuscated prior
|
||||||
|
to the release of the traces. One obfuscation that is noteworthy in the
|
||||||
|
scope of this thesis is related to CPU and RAM amounts, which are
|
||||||
|
expressed respetively in NCUs (\emph{Normalized Compute Units}) and NMUs
|
||||||
|
(\emph{Normalized Memory Units}).
|
||||||
|
|
||||||
|
NCUs and NMUs are defined based on the raw machine resource
|
||||||
|
distributions of the machines within the 8 clusters. A machine having 1
|
||||||
|
NCU CPU power and 1 NMU memory size has the maximum amount of raw CPU
|
||||||
|
power and raw RAM size found in the clusters. While RAM size is measured
|
||||||
|
in bytes for normalization purposes, CPU power was measured in GCU
|
||||||
|
(\emph{Google Compute Units}), a proprietary CPU power measurement unit
|
||||||
|
used by Google that combines several parameters like number of
|
||||||
|
processors and cores, clock frequency, and architecture (i.e.~ISA).
|
||||||
|
|
||||||
|
\hypertarget{overview-of-traces-format}{%
|
||||||
|
\subsection{Overview of traces'
|
||||||
|
format}\label{overview-of-traces-format}}
|
||||||
|
|
||||||
|
The traces have a collective size of approximately 8TiB and are stored
|
||||||
|
in a Gzip-compressed JSONL (JSON lines) format, which means that each
|
||||||
|
table is represented by a single logical ``file'' (stored in several
|
||||||
|
file segments) where each carriage return separated line represents a
|
||||||
|
single record for that table.
|
||||||
|
|
||||||
|
There are namely 5 different table ``files'':
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
\texttt{machine\_configs}, which is a table containing each physical
|
||||||
|
machine's configuration and its evolution over time;
|
||||||
|
\item
|
||||||
|
\texttt{instance\_events}, which is a table of task events;
|
||||||
|
\item
|
||||||
|
\texttt{collection\_events}, which is a table of job events;
|
||||||
|
\item
|
||||||
|
\texttt{machine\_attributes}, which is a table containing (obfuscated)
|
||||||
|
metadata about each physical machine and its evolution over time;
|
||||||
|
\item
|
||||||
|
\texttt{instance\_usage}, which contains resource (CPU/RAM) measures
|
||||||
|
of jobs and tasks running on the single machines.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The scope of this thesis focuses on the tables
|
||||||
|
\texttt{machine\_configs}, \texttt{instance\_events} and
|
||||||
|
\texttt{collection\_events}.
|
||||||
|
|
||||||
|
\hypertarget{remark-on-traces-size}{%
|
||||||
|
\subsection{Remark on traces size}\label{remark-on-traces-size}}
|
||||||
|
|
||||||
|
While the 2011 Google Borg traces were relatively small, with a total
|
||||||
|
size in the order of the tens of gigabytes, the 2019 traces are quite
|
||||||
|
challenging to analyze due to their sheer size. As stated before, the
|
||||||
|
traces have a total size of 8 TiB when stored in the format provided by
|
||||||
|
Google. Even when broken down to table ``files'', unitary sizes still
|
||||||
|
reach the single tebibyte mark (namely for \texttt{machine\_configs},
|
||||||
|
the largest table in the trace).
|
||||||
|
|
||||||
|
Due to this constraints, a careful data engineering based approach was
|
||||||
|
used when reproducing the 2015 DSN paper analysis. Bleeding edge data
|
||||||
|
science technologies like Apache Spark were used to achieve efficient
|
||||||
|
and parallelized computations. This approach is discussed with further
|
||||||
|
detail in the following section.
|
||||||
|
|
||||||
|
\hypertarget{project-requirements-and-analysis}{%
|
||||||
|
\section{Project requirements and
|
||||||
|
analysis}\label{project-requirements-and-analysis}}
|
||||||
|
|
||||||
|
\textbf{TBD} (describe our objective with this analysis in detail)
|
||||||
|
|
||||||
|
\hypertarget{analysis-methodology}{%
|
||||||
|
\section{Analysis methodology}\label{analysis-methodology}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{introduction-on-apache-spark}{%
|
||||||
|
\subsection{Introduction on Apache
|
||||||
|
Spark}\label{introduction-on-apache-spark}}
|
||||||
|
|
||||||
|
Apache Spark is a unified analytics engine for large-scale data
|
||||||
|
processing. In layman's terms, Spark is really useful to parallelize
|
||||||
|
computations in a fast and streamlined way.
|
||||||
|
|
||||||
|
In the scope of this thesis, Spark was used essentially as a Map-Reduce
|
||||||
|
framework for computing aggregated results on the various tables. Due to
|
||||||
|
the sharded nature of table ``files'', Spark is able to spawn a thread
|
||||||
|
per file and run computations using all processors on the server
|
||||||
|
machines used to run the analysis.
|
||||||
|
|
||||||
|
Spark is also quite powerful since it provides automated thread pooling
|
||||||
|
services, and it is able to efficiently store and cache intermediate
|
||||||
|
computation on secondary storage without any additional effort required
|
||||||
|
from the data engineer. This feature was especially useful due to the
|
||||||
|
sheer size of the analyzed data, since the computations required to
|
||||||
|
store up to 1TiB of intermediate data on disk.
|
||||||
|
|
||||||
|
The chosen programming language for writing analysis scripts was Python.
|
||||||
|
Spark has very powerful native Python bindings in the form of the
|
||||||
|
\emph{PySpark} API, which were used to implement the various queries.
|
||||||
|
|
||||||
|
\hypertarget{query-architecture}{%
|
||||||
|
\subsection{Query architecture}\label{query-architecture}}
|
||||||
|
|
||||||
|
In general, each query follows a general Map-Reduce template, where
|
||||||
|
traces are first read, parsed, filtered by performing selections,
|
||||||
|
projections and computing new derived fields. Then, the trace records
|
||||||
|
are often grouped by one of their fields, clustering related data
|
||||||
|
toghether before a reduce or fold operation is applied to each grouping.
|
||||||
|
|
||||||
|
Most input data is in JSONL format and adheres to a schema Google
|
||||||
|
profided in the form of a protobuffer specification\footnote{\href{https://github.com/google/cluster-data/blob/master/clusterdata_trace_format_v3.proto}{Google
|
||||||
|
2019 Borg traces Protobuffer specification on Github}}.
|
||||||
|
|
||||||
|
On of the main quirks in the traces is that fields that have a ``zero''
|
||||||
|
value (i.e.~a value like 0 or the empty string) are often omitted in the
|
||||||
|
JSON object records. When reading the traces in Apache Spark is
|
||||||
|
therefore necessary to check for this possibility and populate those
|
||||||
|
zero fields when omitted.
|
||||||
|
|
||||||
|
Most queries use only two or three fields in each trace records, while
|
||||||
|
the original records often are made of a couple of dozen fields. In
|
||||||
|
order to save memory during the query, a projection is often applied to
|
||||||
|
the data by the means of a .map() operation over the entire trace set,
|
||||||
|
performed using Spark's RDD API.
|
||||||
|
|
||||||
|
Another operation that is often necessary to perform prior to the
|
||||||
|
Map-Reduce core of each query is a record filtering process, which is
|
||||||
|
often motivated by the presence of incomplete data (i.e.~records which
|
||||||
|
contain fields whose values is unknown). This filtering is performed
|
||||||
|
using the .filter() operation of Spark's RDD API.
|
||||||
|
|
||||||
|
The core of each query is often a groupBy followed by a map() operation
|
||||||
|
on the aggregated data. The groupby groups the set of all records into
|
||||||
|
several subsets of records each having something in common. Then, each
|
||||||
|
of this small clusters is reduced with a .map() operation to a single
|
||||||
|
record. The motivation behind this computation is often to analyze a
|
||||||
|
time series of several different traces of programs. This is implemented
|
||||||
|
by groupBy()-ing records by program id, and then map()-ing each program
|
||||||
|
trace set by sorting by time the traces and computing the desired
|
||||||
|
property in the form of a record.
|
||||||
|
|
||||||
|
Sometimes intermediate results are saved in Spark's parquet format in
|
||||||
|
order to compute and save intermediate results beforehand.
|
||||||
|
|
||||||
|
\hypertarget{general-query-script-design}{%
|
||||||
|
\subsection{General Query script
|
||||||
|
design}\label{general-query-script-design}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{ad-hoc-presentation-of-some-analysis-scripts}{%
|
||||||
|
\subsection{Ad-Hoc presentation of some analysis
|
||||||
|
scripts}\label{ad-hoc-presentation-of-some-analysis-scripts}}
|
||||||
|
|
||||||
|
\textbf{TBD} (with diagrams)
|
||||||
|
|
||||||
|
\hypertarget{analysis-and-observations}{%
|
||||||
|
\section{Analysis and observations}\label{analysis-and-observations}}
|
||||||
|
|
||||||
|
\hypertarget{overview-of-machine-configurations-in-each-cluster}{%
|
||||||
|
\subsection{Overview of machine configurations in each
|
||||||
|
cluster}\label{overview-of-machine-configurations-in-each-cluster}}
|
||||||
|
|
||||||
|
\input{figures/machine_configs}
|
||||||
|
|
||||||
|
Refer to figure \ref{fig:machineconfigs}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
machine configurations are definitely more varied than the ones in the
|
||||||
|
2011 traces
|
||||||
|
\item
|
||||||
|
some clusters have more machine variability
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{analysis-of-execution-time-per-each-execution-phase}{%
|
||||||
|
\subsection{Analysis of execution time per each execution
|
||||||
|
phase}\label{analysis-of-execution-time-per-each-execution-phase}}
|
||||||
|
|
||||||
|
\input{figures/machine_time_waste}
|
||||||
|
|
||||||
|
Refer to figures \ref{fig:machinetimewaste-abs} and
|
||||||
|
\ref{fig:machinetimewaste-rel}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
Across all cluster almost 50\% of time is spent in ``unknown''
|
||||||
|
transitions, i.e. there are some time slices that are related to a
|
||||||
|
state transition that Google says are not ``typical'' transitions.
|
||||||
|
This is mostly due to the trace log being intermittent when recording
|
||||||
|
all state transitions.
|
||||||
|
\item
|
||||||
|
80\% of the time spent in KILL and LOST is unknown. This is
|
||||||
|
predictable, since both states indicate that the job execution is not
|
||||||
|
stable (in particular LOST is used when the state logging itself is
|
||||||
|
unstable)
|
||||||
|
\item
|
||||||
|
From the absolute graph we see that the time ``wasted'' on non-finish
|
||||||
|
terminated jobs is very significant
|
||||||
|
\item
|
||||||
|
Execution is the most significant task phase, followed by queuing time
|
||||||
|
and scheduling time (``ready'' state)
|
||||||
|
\item
|
||||||
|
In the absolute graph we see that a significant amount of time is
|
||||||
|
spent to re-schedule evicted jobs (``evicted'' state)
|
||||||
|
\item
|
||||||
|
Cluster A has unusually high queuing times
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{task-slowdown}{%
|
||||||
|
\subsection{Task slowdown}\label{task-slowdown}}
|
||||||
|
|
||||||
|
\input{figures/task_slowdown}
|
||||||
|
|
||||||
|
Refer to figure \ref{fig:taskslowdown}
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
Priority values are different from 0-11 values in the 2011 traces. A
|
||||||
|
conversion table is provided by Google;
|
||||||
|
\item
|
||||||
|
For some priorities (e.g.~101 for cluster D) the relative number of
|
||||||
|
finishing task is very low and the mean slowdown is very high (315).
|
||||||
|
This behaviour differs from the relatively homogeneous values from the
|
||||||
|
2011 traces.
|
||||||
|
\item
|
||||||
|
Some slowdown values cannot be computed since either some tasks have a
|
||||||
|
0ns execution time or for some priorities no tasks in the traces
|
||||||
|
terminate successfully. More raw data on those exception is in
|
||||||
|
Jupyter.
|
||||||
|
\item
|
||||||
|
The \% of finishing jobs is relatively low comparing with the 2011
|
||||||
|
traces.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{reserved-and-actual-resource-usage-of-tasks}{%
|
||||||
|
\subsection{Reserved and actual resource usage of
|
||||||
|
tasks}\label{reserved-and-actual-resource-usage-of-tasks}}
|
||||||
|
|
||||||
|
\input{figures/spatial_resource_waste}
|
||||||
|
|
||||||
|
Refer to figures \ref{fig:spatialresourcewaste-actual} and
|
||||||
|
\ref{fig:spatialresourcewaste-requested}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
Most (mesasured and requested) resources are used by killed job, even
|
||||||
|
more than in the 2011 traces.
|
||||||
|
\item
|
||||||
|
Behaviour is rather homogeneous across datacenters, with the exception
|
||||||
|
of cluster G where a lot of LOST-terminated tasks acquired 70\% of
|
||||||
|
both CPU and RAM
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{correlation-between-task-events-metadata-and-task-termination}{%
|
||||||
|
\subsection{Correlation between task events' metadata and task
|
||||||
|
termination}\label{correlation-between-task-events-metadata-and-task-termination}}
|
||||||
|
|
||||||
|
\input{figures/figure_7}
|
||||||
|
|
||||||
|
Refer to figures \ref{fig:figureVII-a}, \ref{fig:figureVII-b}, and
|
||||||
|
\ref{fig:figureVII-c}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
No smooth curves in this figure either, unlike 2011 traces
|
||||||
|
\item
|
||||||
|
The behaviour of curves for 7a (priority) is almost the opposite of
|
||||||
|
2011, i.e. in-between priorities have higher kill rates while
|
||||||
|
priorities at the extremum have lower kill rates. This could also be
|
||||||
|
due bt the inherent distribution of job terminations;
|
||||||
|
\item
|
||||||
|
Event execution time curves are quite different than 2011, here it
|
||||||
|
seems there is a good correlation between short task execution times
|
||||||
|
and finish event rates, instead of the U shape curve in 2015 DSN
|
||||||
|
\item
|
||||||
|
In figure \ref{fig:figureVII-b} cluster behaviour seems quite uniform
|
||||||
|
\item
|
||||||
|
Machine concurrency seems to play little role in the event termination
|
||||||
|
distribution, as for all concurrency factors the kill rate is at 90\%.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{correlation-between-task-events-resource-metadata-and-task-termination}{%
|
||||||
|
\subsection{Correlation between task events' resource metadata and task
|
||||||
|
termination}\label{correlation-between-task-events-resource-metadata-and-task-termination}}
|
||||||
|
|
||||||
|
\hypertarget{correlation-between-job-events-metadata-and-job-termination}{%
|
||||||
|
\subsection{Correlation between job events' metadata and job
|
||||||
|
termination}\label{correlation-between-job-events-metadata-and-job-termination}}
|
||||||
|
|
||||||
|
\input{figures/figure_9}
|
||||||
|
|
||||||
|
Refer to figures \ref{fig:figureIX-a}, \ref{fig:figureIX-b}, and
|
||||||
|
\ref{fig:figureIX-c}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
Behaviour between cluster varies a lot
|
||||||
|
\item
|
||||||
|
There are no ``smooth'' gradients in the various curves unlike in the
|
||||||
|
2011 traces
|
||||||
|
\item
|
||||||
|
Killed jobs have higher event rates in general, and overall dominate
|
||||||
|
all event rates measures
|
||||||
|
\item
|
||||||
|
There still seems to be a correlation between short execution job
|
||||||
|
times and successfull final termination, and likewise for kills and
|
||||||
|
higher job terminations
|
||||||
|
\item
|
||||||
|
Across all clusters, a machine locality factor of 1 seems to lead to
|
||||||
|
the highest success event rate
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{mean-number-of-tasks-and-event-distribution-per-task-type}{%
|
||||||
|
\subsection{Mean number of tasks and event distribution per task
|
||||||
|
type}\label{mean-number-of-tasks-and-event-distribution-per-task-type}}
|
||||||
|
|
||||||
|
\input{figures/table_iii}
|
||||||
|
|
||||||
|
Refer to figure \ref{fig:tableIII}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
The mean number of events per task is an order of magnitude higher
|
||||||
|
than in the 2011 traces
|
||||||
|
\item
|
||||||
|
Generally speaking, the event type with higher mean is the termination
|
||||||
|
event for the task
|
||||||
|
\item
|
||||||
|
The \# evts mean is higher than the sum of all other event type means,
|
||||||
|
since it appears there are a lot more non-termination events in the
|
||||||
|
2019 traces.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{mean-number-of-tasks-and-event-distribution-per-job-type}{%
|
||||||
|
\subsection{Mean number of tasks and event distribution per job
|
||||||
|
type}\label{mean-number-of-tasks-and-event-distribution-per-job-type}}
|
||||||
|
|
||||||
|
\input{figures/table_iv}
|
||||||
|
|
||||||
|
Refer to figure \ref{fig:tableIV}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
Again the mean number of tasks is significantly higher than the 2011
|
||||||
|
traces, indicating a higher complexity of workloads
|
||||||
|
\item
|
||||||
|
Cluster A has no evicted jobs
|
||||||
|
\item
|
||||||
|
The number of events is however lower than the event means in the 2011
|
||||||
|
traces
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{probability-of-task-successful-termination-given-its-unsuccesful-events}{%
|
||||||
|
\subsection{Probability of task successful termination given its
|
||||||
|
unsuccesful
|
||||||
|
events}\label{probability-of-task-successful-termination-given-its-unsuccesful-events}}
|
||||||
|
|
||||||
|
\input{figures/figure_5}
|
||||||
|
|
||||||
|
Refer to figure \ref{fig:figureV}.
|
||||||
|
|
||||||
|
\textbf{Observations}:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\tightlist
|
||||||
|
\item
|
||||||
|
Behaviour is very different from cluster to cluster
|
||||||
|
\item
|
||||||
|
There is no easy conclusion, unlike in 2011, on the correlation
|
||||||
|
between succesful probability and \# of events of a specific type.
|
||||||
|
\item
|
||||||
|
Clusters B, C and D in particular have very unsmooth lines that vary a
|
||||||
|
lot for small \# evts differences. This may be due to an uneven
|
||||||
|
distribution of \# evts in the traces.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\hypertarget{potential-causes-of-unsuccesful-executions}{%
|
||||||
|
\subsection{Potential causes of unsuccesful
|
||||||
|
executions}\label{potential-causes-of-unsuccesful-executions}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{implementation-issues-analysis-limitations}{%
|
||||||
|
\section{Implementation issues -- Analysis
|
||||||
|
limitations}\label{implementation-issues-analysis-limitations}}
|
||||||
|
|
||||||
|
\hypertarget{discussion-on-unknown-fields}{%
|
||||||
|
\subsection{Discussion on unknown
|
||||||
|
fields}\label{discussion-on-unknown-fields}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{limitation-on-computation-resources-required-for-the-analysis}{%
|
||||||
|
\subsection{Limitation on computation resources required for the
|
||||||
|
analysis}\label{limitation-on-computation-resources-required-for-the-analysis}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{other-limitations}{%
|
||||||
|
\subsection{Other limitations \ldots{}}\label{other-limitations}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\hypertarget{conclusions-and-future-work-or-possible-developments}{%
|
||||||
|
\section{Conclusions and future work or possible
|
||||||
|
developments}\label{conclusions-and-future-work-or-possible-developments}}
|
||||||
|
|
||||||
|
\textbf{TBD}
|
||||||
|
|
||||||
|
\end{document}
|
BIN
status.ods
BIN
status.ods
Binary file not shown.
Loading…
Reference in a new issue