# Statuses total time
Sums the time that instances spend in each of the states in the diagram saved as
"statuses.drawio". Unknown times are summed as "unknown".

In [3]:
import json
import sys
import pandas
import seaborn as sns
import matplotlib as mpl
# Select matplotlib's PGF backend; the figures below are saved as .pgf files.
# This is configured here, before matplotlib.pyplot is imported.
mpl.use("pgf")
# LaTeX-oriented rendering options: pdflatex as the TeX system, serif fonts,
# text typeset through TeX, and no font setup taken from the rc parameters.
mpl.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})
import matplotlib.pyplot as plt

In [4]:
# QUEUE = set(["0-2", "1-2", "assumptions:", "1-1", "1-0"])
# RESUB = set(["4-1", "4-0", "5-1", "6-1", "7-1", "8-1", "assumptions:", "5-0", "6-0", "7-0", \
#   "8-0"])
# READY = set(["0-3", "2-3", "0-9", "2-9", "9-3", "2-7", "2-8", "9-7", "9-8", \
#   "9-9", "0-7", "0-8", "assumptions:", "2-0", "2-4", "9-4", "9-1"])
# RUN = set(["3-1", "3-10", "3-4", "3-5", "3-6", "3-7", "3-8", "10-5", "10-6", \
#   "10-7", "10-8", "10-4", "10-10", "10-1", "assumptions:", "3-0", "10-0", "3-3"])

# Keys of the per-termination time dictionaries, grouped by the bucket whose
# time they contribute to in create_df(). Any key not listed in one of these
# sets is accounted as "Unknown" time.
# NOTE(review): keys look like "<from>-<to>" state numbers from
# statuses.drawio — confirm against the diagram.
QUEUE = {"0-2", "1-2"}
ENDED = {"5-1", "6-1", "7-1", "8-1"}
READY = {"0-3", "0-9", "2-3", "2-9", "9-3", "9-9"}
RUN = {
    "3-1", "3-4", "3-5", "3-6", "3-7", "3-8", "3-10",
    "10-1", "10-4", "10-5", "10-6", "10-7", "10-8", "10-10",
}
EVICT = {"4-1", "4-0"}

In [5]:
# Base directory under which create_df() looks for its input files
# (DIR + "/machine_time_waste/<cluster>_state_changes.json").
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DIR = "/Users/maggicl/git/bachelorThesis"

In [6]:
def to_name(et):
    """Translate a numeric termination code into its display label.

    Codes 4, 5, 6, 7 and 8 map to 'EVICT', 'FAIL', 'FINISH', 'KILL' and
    'LOST' respectively; any other value yields 'NO_TERM'.
    """
    known_codes = (
        (4, 'EVICT'),
        (5, 'FAIL'),
        (6, 'FINISH'),
        (7, 'KILL'),
        (8, 'LOST'),
    )
    # Plain equality comparison, so any value comparing equal to a code matches.
    for code, label in known_codes:
        if et == code:
            return label
    return 'NO_TERM'

def create_df(cluster):
    """Load one cluster's state-change file and tabulate time per phase.

    Reads ``DIR + "/machine_time_waste/" + cluster + "_state_changes.json"``
    and walks the ``[termination, {key: time}]`` entries stored under its
    ``"data"`` key. A ``None`` termination is treated as 0, and only entries
    whose termination is 4, 5, 6 or 7 are kept. For every kept entry the times
    are bucketed by key (QUEUE, ENDED, READY, RUN, EVICT, otherwise unknown)
    and four rows are produced, in this order:

    * ``Queue``        -- QUEUE + READY time
    * ``Resubmission`` -- ENDED + EVICT time
    * ``Running``      -- RUN time
    * ``Unknown``      -- time under any other key

    Every stored value is the raw sum divided by 1000.
    NOTE(review): presumably a microsecond-to-millisecond conversion, given the
    ``time_ms`` column name — confirm against the producer of the JSON file.

    Returns:
        A ``(DataFrame, total)`` tuple. The frame has the columns
        ``'Last termination'``, ``'time_type'`` and ``'time_ms'``; ``total`` is
        the sum of all bucketed time (also divided by 1000) over kept entries.
    """
    columns = ['Last termination', 'time_type', 'time_ms']
    path = DIR + "/machine_time_waste/" + cluster + "_state_changes.json"

    with open(path, 'r') as source:
        loaded = json.load(source)

    table = {column: [] for column in columns}
    grand_total = 0

    # Buckets are probed in this order; the first set containing the key wins.
    buckets = (
        ("queue", QUEUE),
        ("ended", ENDED),
        ("ready", READY),
        ("run", RUN),
        ("evict", EVICT),
    )

    for entry in loaded["data"]:
        termination = entry[0] if entry[0] is not None else 0

        # Skip terminations that are not plotted.
        if termination not in [4, 5, 6, 7]:
            continue

        spent = dict.fromkeys(
            ("queue", "ended", "ready", "run", "evict", "unknown"), 0)
        for key, amount in entry[1].items():
            target = next(
                (name for name, members in buckets if key in members),
                "unknown")
            spent[target] += amount

        label = to_name(termination)
        rows = (
            ('Queue', spent["queue"] + spent["ready"]),
            ('Resubmission', spent["ended"] + spent["evict"]),
            ('Running', spent["run"]),
            ('Unknown', spent["unknown"]),
        )
        for phase, amount in rows:
            table['Last termination'].append(label)
            table['time_type'].append(phase)
            table['time_ms'].append(amount / 1000)

        grand_total += (spent["queue"] + spent["ended"] + spent["ready"]
                        + spent["run"] + spent["evict"] + spent["unknown"]) / 1000

    return (pandas.DataFrame(table, columns=columns), grand_total)

## Graph 1: Absolute total time spent per status per "last termination" type

In [7]:
def graph_1(df, cluster):
    """Plot absolute total time per status, grouped by last termination.

    Draws a seaborn histogram of ``df`` with one bar group per
    ``"Last termination"`` value, weighted by ``time_ms`` and split (dodged)
    by ``time_type``, on a logarithmic y axis and without a legend.

    Args:
        df: frame with ``"Last termination"``, ``"time_type"`` and
            ``"time_ms"`` columns.
        cluster: cluster name, inserted verbatim into the title.
    """
    plot_options = dict(
        x="Last termination",
        weights="time_ms",
        shrink=.5,
        hue="time_type",
        multiple="dodge",
        discrete=True,
        legend=False,
    )
    axes = sns.histplot(df, **plot_options)
    axes.set_yscale("log")
    axes.set_ylabel("Total (milliseconds)")
    axes.set_title(
        "Cluster " + cluster
        + ": Absolute total time spent per status per \"last termination\" type"
    )

## Graph 2: Relative total time spent per status per "last termination" type

For each "last termination" type, values are shown as proportions of the overall total time (summed over all types).

In [8]:
def graph_2(df, cluster, totals):
    """Plot time per execution phase, stacked by last termination, scaled by ``totals``.

    Works on a copy of ``df``: the ``time_ms`` of every row whose
    ``"Last termination"`` is one of the labels for codes 4-7 is divided by
    ``totals``, and the ``time_type`` column is renamed to
    ``"Execution phase"`` so that the legend shows that heading. The caller's
    frame is left unchanged.

    Args:
        df: frame with ``"Last termination"``, ``"time_type"`` and
            ``"time_ms"`` columns.
        cluster: ``"all"`` for the aggregate plot, ``"2011"`` for the 2011
            traces, otherwise a cluster name (upper-cased in the title).
        totals: divisor applied to ``time_ms``.
    """
    scaled = df.copy()
    for code in [4, 5, 6, 7]:
        rows = scaled["Last termination"] == to_name(code)
        scaled.loc[rows, "time_ms"] = scaled.loc[rows, "time_ms"] / totals

    # Reassign instead of renaming in place; the column name becomes the legend title.
    scaled = scaled.rename(columns={'time_type': 'Execution phase'})

    h = sns.histplot(scaled, x="Last termination",
                     weights="time_ms", shrink=.5, common_bins=True,
                     hue="Execution phase", multiple="stack", discrete=True, legend=True)

    # The previous chained conditional could never reach the "2011 traces"
    # title, because its first test (cluster != "all") captured every
    # non-"all" value, including "2011".
    if cluster == "all":
        title = "All clusters"
    elif cluster == "2011":
        title = "2011 traces"
    else:
        title = "Cluster " + cluster.upper()
    h.set_title(title)

In [9]:
# Running aggregate over all processed clusters: dft accumulates the per-row
# times, tts accumulates the grand totals returned by create_df().
dft = None
tts = None

# Iterating over a string yields its characters, so each letter is one cluster
# name; only cluster "a" is processed here.
for cluster in "a":
    df, totals = create_df(cluster)
    
    print(df)

    #plt.figure(figsize=(10,8))
    #graph_1(df, cluster)
    # One relative-time figure per cluster, saved as a .pgf file.
    plt.figure(figsize=(4,3))
    graph_2(df, cluster, totals)
    plt.savefig('../report/figures/machine_time_waste/cluster_%s.pgf' % cluster)
    
    if dft is None:
        # NOTE(review): dft is the same object as the first cluster's df (no
        # copy), so the in-place accumulation below also changes that frame.
        dft = df
        tts = totals
    else:
        # Series.add aligns on the row index; this assumes every cluster frame
        # lists the same (termination, time_type) rows in the same order —
        # TODO confirm.
        dft.loc[:, "time_ms"] = dft["time_ms"].add(df["time_ms"], fill_value=0)
        tts += totals

#plt.figure(figsize=(10,8))
#graph_1(dft, "all")
# Aggregate figure over every cluster processed above.
plt.figure(figsize=(4,3))
graph_2(dft, "all", tts)
plt.savefig('../report/figures/machine_time_waste/cluster_all.pgf')

# Hand-entered values for the 2011 traces, laid out like a create_df() frame:
# four time types for each of EVICT, FAIL, FINISH and KILL.
# NOTE(review): graph_2 is called with totals=100 below, so these are
# presumably percentages — confirm the source of the numbers.
d2011 = {'Last termination': ["EVICT"] * 4 + ["FAIL"] * 4 + ["FINISH"] * 4 + ["KILL"] * 4,
         'time_type': ["Queue", "Resubmission", "Running", "Unknown"] * 4,
         'time_ms': [2.5, 0., 17.5, 0, 0, 0, 5, 0, 1, 0, 39, 0, 5, 1, 30, 0]}

d2011 = pandas.DataFrame(d2011, columns=['Last termination', 'time_type', 'time_ms'])
plt.figure(figsize=(4,3))
graph_2(d2011, "2011", 100)
plt.savefig('../report/figures/machine_time_waste/cluster_2011.pgf')


   Last termination     time_type       time_ms
0             EVICT         Queue  1.049774e+12
1             EVICT  Resubmission  5.530617e+08
2             EVICT       Running  3.218063e+13
3             EVICT       Unknown  3.383291e+12
4              FAIL         Queue  9.483261e+11
5              FAIL  Resubmission  7.150500e+01
6              FAIL       Running  7.265195e+12
7              FAIL       Unknown  2.799674e+12
8            FINISH         Queue  3.317009e+13
9            FINISH  Resubmission  1.828825e+07
10           FINISH       Running  3.788436e+13
11           FINISH       Unknown  2.482661e+13
12             KILL         Queue  7.482888e+13
13             KILL  Resubmission  1.211419e+11
14             KILL       Running  6.311166e+14
15             KILL       Unknown  1.207792e+15


In [10]:
# Express the aggregated times as fractions of the grand total tts.
# NOTE(review): the division is written back into dft, so running this cell a
# second time divides the already-normalised values again.
dft["time_ms"] /= tts
print(dft)

   Last termination     time_type       time_ms
0             EVICT         Queue  5.102510e-04
1             EVICT  Resubmission  2.688201e-07
2             EVICT       Running  1.564165e-02
3             EVICT       Unknown  1.644476e-03
4              FAIL         Queue  4.609415e-04
5              FAIL  Resubmission  3.475558e-14
6              FAIL       Running  3.531306e-03
7              FAIL       Unknown  1.360804e-03
8            FINISH         Queue  1.612259e-02
9            FINISH  Resubmission  8.889150e-09
10           FINISH       Running  1.841400e-02
11           FINISH       Unknown  1.206717e-02
12             KILL         Queue  3.637118e-02
13             KILL  Resubmission  5.888201e-05
14             KILL       Running  3.067593e-01
15             KILL       Unknown  5.870572e-01
