# bachelorThesis/figure_8/figure8-cd-only.py

#!/usr/bin/env python3
# coding: utf-8

import os
import json
import pandas as pd
import findspark
findspark.init()
import pyspark
import pyspark.sql
import sys
import gzip
from pyspark import AccumulatorParam
from pyspark.sql.functions import lit
from pyspark.sql import Window
from pyspark.sql.types import *
from decimal import *

# Directory holding the per-cluster result files and "working" markers.
CHECKDIR = "/home/claudio/google_2019/thesis_queries/figure_8/"

# Expected invocation: <script> {cluster} {tmpdir} {maxram}
# The count is compared by value: `is not` tests object identity, which only
# happens to work for small integers and triggers a SyntaxWarning.
if len(sys.argv) != 4:
    print(sys.argv[0] + " {cluster} {tmpdir} {maxram}")
    sys.exit()

cluster = sys.argv[1]

# Nothing to do when the result for this cluster is already in CHECKDIR.
if os.path.exists(CHECKDIR + cluster + "_figure8cd.json"):
    print("already computed")
    sys.exit()

# Claim the cluster by creating the marker file. Mode "x" creates the file
# atomically and fails if it already exists, so two concurrent runs cannot
# both get past this point, and no shell command is assembled from the
# command-line argument.
try:
    with open(CHECKDIR + cluster + "_figure8cd_working", "x"):
        pass
except FileExistsError:
    print("already in execution")
    sys.exit()
except OSError as err:
    # Best effort, as with the former `touch`: a marker that cannot be
    # created (e.g. missing directory) does not prevent the computation.
    print("could not create working marker: " + str(err))


# Spark session for the job. The scratch directory and the driver memory
# limit come from the second and third command-line arguments.
spark_options = (
    ("spark.driver.maxResultSize", "128g"),
    ("spark.local.dir", sys.argv[2]),
    ("spark.driver.memory", sys.argv[3]),
)
session_builder = pyspark.sql.SparkSession.builder.appName("task_slowdown")
for option_name, option_value in spark_options:
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()
sc = spark.sparkContext

def tabid(x):
    """Return a single Decimal identifying the task instance described by x.

    The integer part is x.collection_id and the fractional part is
    x.instance_index scaled down by 2**64.

    NOTE(review): the arithmetic runs in the active decimal context; with a
    large collection_id the low-order digits of the fraction may be rounded
    away, so nearby instance indexes could map to the same id -- confirm that
    ids stay unique for the data in use.
    """
    collection_part = Decimal(x.collection_id)
    instance_part = Decimal(x.instance_index) / Decimal(2**64)
    return collection_part + instance_part

#
# READING INSTANCE EVENTS DATA
#
# Alternative inputs kept from earlier runs (disabled):
#   "/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz"
#   "/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json"
instance_events_dir = "/home/claudio/" + cluster + "/"
# Glob matching every gzipped JSON instance-events file of the cluster.
dfepath = instance_events_dir + cluster + "_instance_events*.json.gz"
df = spark.read.json(dfepath)

# READING MACHINE EVENTS DATA: load the CSV, clean it, sort it and publish it
# as a broadcast list of (time, machine_id, capacity.cpus, capacity.memory)
# tuples ordered by machine_id and then time.
print("Starting to read machine events...")
machine_events_csv = "/home/claudio/google_2019/machine_events/" + cluster + "_machine_events.csv"
# Empty numeric cells are read as the sentinel -1 so they can be filtered out below.
csv_converters = {
    'time': lambda x: int(x) if x != '' else -1,
    'machine_id': str,
    'capacity.cpus': lambda x: Decimal(x) if x != '' else -1,
    'capacity.memory': lambda x: Decimal(x) if x != '' else -1,
}
machine_events = pd.read_csv(machine_events_csv, converters=csv_converters)
print("Dropping remove events...")
usable_rows = (
    (machine_events.type != 2)
    & (machine_events.time != -1)
    & (machine_events["capacity.cpus"] != -1)
    & (machine_events["capacity.memory"] != -1)
)
machine_events = machine_events[usable_rows]
print("Dropping missing data events...")
machine_events = machine_events[machine_events.missing_data_reason.isnull()]
print("Projecting on useful columns...")
machine_events = machine_events[["time", "machine_id", "capacity.cpus", "capacity.memory"]]
print("Sorting by time...")
machine_events = machine_events.sort_values(by=["machine_id", "time"])
print("Converting to broadcast variable...")
dfm = sc.broadcast(list(map(tuple, machine_events.to_numpy())))
# The DataFrame is no longer needed once its rows live in the broadcast.
del machine_events
print("Done with machine events.")

def _is_complete_event(x):
    """Tell whether an instance event carries every field used by this script."""
    for field in ("time", "type", "machine_id", "instance_index", "collection_id", "resource_request"):
        if getattr(x, field) is None:
            return False
    request = x.resource_request
    return request.cpus is not None and request.memory is not None


def _to_event_record(x):
    """Project an instance event onto [id, time, type, rcpu, rram, mid]."""
    request = x.resource_request
    return [tabid(x), int(x.time), int(x.type),
            Decimal(request.cpus), Decimal(request.memory), x.machine_id]


# Keep only complete events and reduce them to the columns used further on.
complete_events = df.rdd.filter(_is_complete_event).map(_to_event_record)
df = complete_events.toDF(["id", "time", "type", "rcpu", "rram", "mid"])


def get_machine_time_resources(machine_id, time):
    """Locate the capacity record of a machine that applies at a given time.

    The lookup runs on dfm.value, the broadcast list of
    (time, machine_id, capacity.cpus, capacity.memory) tuples sorted by
    machine_id and then by time.

    Returns the index of the latest record of machine_id whose time is not
    after `time`. When every record of the machine is later than `time`, the
    index of its first record is returned. Returns None when the machine has
    no record at all (including when the table is empty).

    The previous recursive search stopped on two-element ranges by returning
    the lower index whenever it matched the machine, so a lookup with a very
    large time could return the machine's second-to-last record; it also
    raised IndexError on an empty table.
    """
    rows = dfm.value
    wanted = (machine_id, time)

    # Upper-bound binary search on the (machine_id, time) sort key: `lo` ends
    # up as the first index whose key is strictly greater than `wanted`.
    lo, hi = 0, len(rows)
    while lo < hi:
        mid = (lo + hi) // 2
        if (rows[mid][1], rows[mid][0]) <= wanted:
            lo = mid + 1
        else:
            hi = mid

    # The record just before `lo` is the latest one not after `time`.
    if lo > 0 and rows[lo - 1][1] == machine_id:
        return lo - 1
    # Otherwise every record of the machine (if any) starts at `lo`.
    if lo < len(rows) and rows[lo][1] == machine_id:
        return lo
    return None

def increment_reserv_bucket(bucket, taskid, value):
    """Count one observation of `value` in the 41-slot histogram of `taskid`.

    bucket maps task ids to histograms and is updated in place; a histogram
    of zeros is created the first time a task id is seen. Slot 0 collects
    negative values, slots 1..40 split [0, 1) into steps of 1/40, and every
    value of 1 or more also lands in slot 40.
    """
    if value < 0:
        slot = 0
    elif value >= 1:
        slot = 40
    else:
        slot = int(value * 40) + 1

    histogram = bucket.setdefault(taskid, [0] * 41)
    histogram[slot] += 1

def bucket_sum_per_termination(bucket, last_term_by_id):
    """Merge per-task histograms into one histogram per termination type.

    bucket maps task ids to histograms and last_term_by_id maps the same ids
    to a termination type (-1, 4, 5, 6, 7 or 8). The returned dict has one
    entry per termination type: the element-wise sum of the histograms of
    the tasks with that type, or None when no task has it.
    """
    totals = dict.fromkeys((-1, 4, 5, 6, 7, 8))
    for tid, histogram in bucket.items():
        term = last_term_by_id[tid]
        so_far = totals[term]
        if so_far is None:
            totals[term] = histogram
        else:
            totals[term] = [a + b for a, b in zip(so_far, histogram)]
    return totals

def for_each_joined(x):
    """Build the reservation histograms for all task events of one machine.

    x is a (machine_id, events) pair; every event exposes id, time, type,
    rcpu and rram.

    The events are replayed in time order. At each event the requests of the
    tasks currently tracked on the machine are summed and divided by the
    machine capacity in force at that time; the two ratios (CPU and RAM) are
    counted in the histograms of the task the event belongs to. When the
    machine has no capacity record the ratio is -1, which lands in slot 0.
    Finally the per-task histograms are merged by the last termination type
    (event types 4..8) seen for each task, or -1 when none was seen.

    Returns {'rscpu': {...}, 'rsram': {...}}, each mapping a termination
    type to a histogram or None.

    Two defects of the previous version are fixed here:
    * the capacity record was advanced when the *next* record was at or
      after the event time (and by one step at most), so events were matched
      with capacity records from their future;
    * the request of a terminated task was never dropped, although the
      bookkeeping was documented as "data removed when task terminates", so
      the summed requests could only grow.
    """
    machine_id = x[0]
    ts = sorted((t for t in x[1] if t.time is not None), key=lambda t: t.time)

    last_req_by_id = {}  # task id -> (cpu, ram) last requested; dropped when the task terminates
    cpu_reservs_by_id = {}
    ram_reservs_by_id = {}
    last_term_by_id = {}  # task id -> last termination type, -1 while none was seen

    # All capacity records of this machine, oldest first (None if unknown machine).
    start = get_machine_time_resources(machine_id, 0)
    end = get_machine_time_resources(machine_id, 6_000_000_000_000)
    machine_logs = None if start is None or end is None else dfm.value[start:(end + 1)]
    log_pos = 0  # index in machine_logs of the capacity record in force

    for t in ts:
        if machine_logs is not None:
            # Move to the latest capacity record that is not after this event;
            # events older than every record keep using the first one.
            while log_pos + 1 < len(machine_logs) and machine_logs[log_pos + 1][0] <= t.time:
                log_pos += 1

        last_term_by_id.setdefault(t.id, -1)
        if t.rcpu is not None and t.rram is not None:
            last_req_by_id[t.id] = (t.rcpu, t.rram)

        if machine_logs is not None:
            capacity = machine_logs[log_pos]
            reserv_cpu = sum(req[0] for req in last_req_by_id.values()) / capacity[2]
            reserv_ram = sum(req[1] for req in last_req_by_id.values()) / capacity[3]
        else:
            reserv_cpu = -1
            reserv_ram = -1
        increment_reserv_bucket(cpu_reservs_by_id, t.id, reserv_cpu)
        increment_reserv_bucket(ram_reservs_by_id, t.id, reserv_ram)

        if 4 <= t.type <= 8:
            last_term_by_id[t.id] = t.type
            # The terminated task stops counting towards later reservations.
            last_req_by_id.pop(t.id, None)

    return {
        'rscpu': bucket_sum_per_termination(cpu_reservs_by_id, last_term_by_id),
        'rsram': bucket_sum_per_termination(ram_reservs_by_id, last_term_by_id),
    }

def fold_resobjs(ro1, ro2):
    """Combine two per-machine result objects into one.

    Each argument is either None or a dict of dicts of histograms (or None
    entries). None acts as the neutral element. Otherwise ro1 is updated in
    place and returned: an empty entry takes the value from ro2, and two
    histograms are summed element by element.
    """
    if ro1 is None:
        return ro2
    if ro2 is None:
        return ro1

    for metric, per_term in ro1.items():
        for term, mine in per_term.items():
            theirs = ro2[metric][term]
            if mine is None:
                per_term[term] = theirs
            elif theirs is not None:
                per_term[term] = [a + b for a, b in zip(mine, theirs)]
    return ro1
 
import random

# Run the job and store the result. The "working" marker is released in a
# `finally` clause so that a failed run no longer leaves a stale marker that
# blocks every later run, and it is removed with os.remove instead of a shell
# command built from the command-line argument.
try:
    # Group the events by machine, spread the groups over 1000 partitions and
    # fold the per-machine histograms into a single result object.
    # NOTE(review): the partition function is random, so a recomputed shuffle
    # task may route groups differently -- confirm this is acceptable.
    result = (
        df.rdd
        .groupBy(lambda x: x.mid)
        .partitionBy(1000, lambda x: random.randint(0, 1000-1))
        .map(for_each_joined)
        .fold(None, fold_resobjs)
    )

    # NOTE(review): the result is written next to this script, while the
    # "already computed" check looks in CHECKDIR -- confirm both are the same
    # directory on the machine running the job.
    d = os.path.dirname(os.path.realpath(__file__))

    with open(d + "/" + cluster + "_figure8cd.json", "w") as f:
        json.dump(result, f)
finally:
    try:
        os.remove(CHECKDIR + cluster + "_figure8cd_working")
    except FileNotFoundError:
        # Nothing to clean up if the marker was never created.
        pass

# vim: set ts=4 sw=4 et tw=120:
More query results 2021-04-16 10:29:34 +00:00			`#!/usr/bin/env python3`
			`# coding: utf-8`

			`import os`
			`import json`
			`import pandas as pd`
			`import findspark`
			`findspark.init()`
			`import pyspark`
			`import pyspark.sql`
			`import sys`
			`import gzip`
			`from pyspark import AccumulatorParam`
			`from pyspark.sql.functions import lit`
			`from pyspark.sql import Window`
			`from pyspark.sql.types import *`
			`from decimal import *`

more results for 7c 2021-04-19 12:27:09 +00:00			`CHECKDIR = "/home/claudio/google_2019/thesis_queries/figure_8/"`

More query results 2021-04-16 10:29:34 +00:00			`if len(sys.argv) is not 4:`
			`print(sys.argv[0] + " {cluster} {tmpdir} {maxram}")`
			`sys.exit()`

			`cluster=sys.argv[1]`

more results for 7c 2021-04-19 12:27:09 +00:00			`if os.path.exists(CHECKDIR + cluster + "_figure8cd.json"):`
			`print("already computed")`
			`sys.exit()`

			`if os.path.exists(CHECKDIR + cluster + "_figure8cd_working"):`
			`print("already in execution")`
			`sys.exit()`

			`os.system("touch " + CHECKDIR + cluster + "_figure8cd_working")`


More query results 2021-04-16 10:29:34 +00:00			`spark = pyspark.sql.SparkSession.builder \`
			`.appName("task_slowdown") \`
			`.config("spark.driver.maxResultSize", "128g") \`
			`.config("spark.local.dir", sys.argv[2]) \`
			`.config("spark.driver.memory", sys.argv[3]) \`
			`.getOrCreate()`
			`sc = spark.sparkContext`

more results for 7c 2021-04-19 12:27:09 +00:00			`def tabid(x):`
			`return Decimal(x.collection_id) + Decimal(x.instance_index) / Decimal(2**64)`

More query results 2021-04-16 10:29:34 +00:00			`#`
			`# READING INSTANCE EVENTS DATA`
			`#`
correction 9c 2021-04-27 14:33:37 +00:00			`#dfepath = "/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz"`
			`dfepath = "/home/claudio/" + cluster + "/" + cluster + "_instance_events*.json.gz"`
more results for 7c 2021-04-19 12:27:09 +00:00			`#dfepath = "/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json"`
More query results 2021-04-16 10:29:34 +00:00			`df = spark.read.json(dfepath)`

			`# READING MACHINE EVENTS DATA, sort them and save them as broadcast variable`
			`print("Starting to read machine events...")`
correction 9c 2021-04-27 14:33:37 +00:00			`dfm = pd.read_csv("/home/claudio/google_2019/machine_events/" + cluster + "_machine_events.csv", converters={`
More query results 2021-04-16 10:29:34 +00:00			`'time': lambda x: -1 if x == '' else int(x),`
			`'machine_id': lambda x: str(x),`
			`'capacity.cpus': lambda x: -1 if x == '' else Decimal(x),`
			`'capacity.memory': lambda x: -1 if x == '' else Decimal(x)})`
			`print("Dropping remove events...")`
			`dfm = dfm[(dfm.type!=2)&(dfm.time!=-1)&(dfm["capacity.cpus"]!=-1)&(dfm["capacity.memory"]!=-1)]`
			`print("Dropping missing data events...")`
			`dfm = dfm[dfm.missing_data_reason.isnull()]`
			`print("Projecting on useful columns...")`
			`dfm = dfm[["time", "machine_id", "capacity.cpus", "capacity.memory"]]`
			`print("Sorting by time...")`
			`dfm = dfm.sort_values(by=["machine_id", "time"])`
			`print("Converting to broadcast variable...")`
			`dfm = sc.broadcast([tuple(r) for r in dfm.to_numpy()])`
			`print("Done with machine events.")`

more results for 7c 2021-04-19 12:27:09 +00:00			`df = df.rdd \`
More query results 2021-04-16 10:29:34 +00:00			`.filter(lambda x: x.time is not None and x.type is not None and x.machine_id is not None and`
			`x.instance_index is not None and x.collection_id is not None and x.resource_request is not None and`
			`x.resource_request.cpus is not None and x.resource_request.memory is not None) \`
			`.map(lambda x: [tabid(x), int(x.time), int(x.type),`
			`Decimal(x.resource_request.cpus), Decimal(x.resource_request.memory), x.machine_id]) \`
			`.toDF(["id", "time", "type", "rcpu", "rram", "mid"])`


			`def get_machine_time_resources(machine_id, time):`
			`def aux(i, j):`
			`if i == j:`
more results for 7c 2021-04-19 12:27:09 +00:00			`return i if dfm.value[i][1] == machine_id else None`
More query results 2021-04-16 10:29:34 +00:00			`elif i + 1 == j:`
			`if dfm.value[i][1] == machine_id:`
more results for 7c 2021-04-19 12:27:09 +00:00			`return i`
More query results 2021-04-16 10:29:34 +00:00			`elif dfm.value[j][1] == machine_id:`
more results for 7c 2021-04-19 12:27:09 +00:00			`return j`
More query results 2021-04-16 10:29:34 +00:00			`else:`
			`return None`

			`mid = (i + j) // 2`

			`if dfm.value[mid][1] > machine_id:`
			`return aux(i, mid - 1)`
			`elif dfm.value[mid][1] < machine_id:`
			`return aux(mid + 1, j)`
			`elif dfm.value[mid][0] > time:`
			`return aux(i, mid)`
			`elif dfm.value[mid][0] < time:`
			`return aux(mid, j)`
			`else:`
			`return mid`

			`return aux(0, len(dfm.value)-1)`

			`def increment_reserv_bucket(bucket, taskid, value):`
			`if value < 0:`
			`idx = 0`
			`else:`
			`idx = 40 if value >= 1 else (int(value * 40) + 1)`

			`if taskid not in bucket:`
Notebook done for figure 8 2021-04-17 10:50:40 +00:00			`bucket[taskid] = [0] * 41`
More query results 2021-04-16 10:29:34 +00:00			`bucket[taskid][idx] += 1`

			`def bucket_sum_per_termination(bucket, last_term_by_id):`
			`result = {-1: None, 4: None, 5: None, 6: None, 7: None, 8: None}`
			`for tid, vs in bucket.items():`
			`term = last_term_by_id[tid]`
			`if result[term] is None:`
			`result[term] = vs`
			`else:`
			`result[term] = [sum(x) for x in zip(result[term], vs)]`
			`return result`

			`def for_each_joined(x):`
			`machine_id = x[0]`
			`ts = x[1]`

done figure 7 results 2021-04-20 07:06:24 +00:00			`ts = filter(lambda t: t.time is not None, ts);`
More query results 2021-04-16 10:29:34 +00:00			`ts = sorted(ts, key=lambda x: x.time)`
done figure 7 results 2021-04-20 07:06:24 +00:00
More query results 2021-04-16 10:29:34 +00:00			`last_req_by_id = {} # map taskid -> last known req [cpu, ram] (data removed when task terminates)`

			`cpu_reservs_by_id = {}`
			`ram_reservs_by_id = {}`

			`last_term_by_id = {} # map taskid -> last termination`
			`start = get_machine_time_resources(machine_id, 0)`
			`end = get_machine_time_resources(machine_id, 6_000_000_000_000)`
			`machine_logs = None if start is None or end is None else dfm.value[start:(end+1)]`

			`for i, t in enumerate(ts):`
			`if machine_logs is not None and len(machine_logs) > 1 and machine_logs[1][0] >= t.time:`
			`machine_logs.pop(0)`
			`if t.id not in last_term_by_id:`
			`last_term_by_id[t.id] = -1`
			`if t.rcpu is not None and t.rram is not None:`
			`last_req_by_id[t.id] = (t.rcpu, t.rram)`
			`# 8b`
			`tot_req = [sum(x) for x in zip(*last_req_by_id.values())]`
			`if machine_logs is not None:`
			`reserv_cpu = tot_req[0] / machine_logs[0][2]`
			`reserv_ram = tot_req[1] / machine_logs[0][3]`
			`else:`
			`reserv_cpu = -1`
			`reserv_ram = -1`
			`increment_reserv_bucket(cpu_reservs_by_id, t.id, reserv_cpu)`
			`increment_reserv_bucket(ram_reservs_by_id, t.id, reserv_ram)`
			`if t.type >= 4 and t.type <= 8:`
			`last_term_by_id[t.id] = t.type`

			`resobj = {'rscpu': cpu_reservs_by_id, 'rsram': ram_reservs_by_id}`

			`for k, v in resobj.items():`
			`resobj[k] = bucket_sum_per_termination(v, last_term_by_id)`

			`return resobj`

			`def fold_resobjs(ro1, ro2):`
			`if ro1 is None:`
			`return ro2`
			`elif ro2 is None:`
			`return ro1`
			`else:`
			`for k in ro1.keys():`
			`for kk in ro1[k].keys():`
			`if ro1[k][kk] is None:`
			`ro1[k][kk] = ro2[k][kk]`
			`elif ro2[k][kk] is None:`
			`continue`
			`else:`
			`ro1[k][kk] = [sum(x) for x in zip(ro1[k][kk], ro2[k][kk])]`
			`return ro1`

			`import random`

			`result = df.rdd \`
			`.groupBy(lambda x: x.mid) \`
more results for 7c 2021-04-19 12:27:09 +00:00			`.partitionBy(1000, lambda x: random.randint(0, 1000-1)) \`
More query results 2021-04-16 10:29:34 +00:00			`.map(for_each_joined) \`
			`.fold(None, fold_resobjs)`

			`d = os.path.dirname(os.path.realpath(__file__))`

more results for 7c 2021-04-19 12:27:09 +00:00			`with open(d + "/" + cluster + "_figure8cd.json", "w") as f:`
More query results 2021-04-16 10:29:34 +00:00			`json.dump(result, f)`

more results for 7c 2021-04-19 12:27:09 +00:00			`os.system("rm " + CHECKDIR + cluster + "_figure8cd_working")`

More query results 2021-04-16 10:29:34 +00:00			`# vim: set ts=4 sw=4 et tw=120:`