bachelorThesis/task_slowdown/task_slowdown.py

#!/usr/bin/env python3
# coding: utf-8
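"""Measure task "slowdown" from the Google 2019 instance event traces.

For a single cluster (chosen on the command line) the script walks every
task's event log in timestamp order and, for each stretch between termination
events (types 4-8), records the elapsed time and the event types observed.
Only tasks whose last termination event is of type 5 contribute; their
interval lists are grouped by task priority and written, together with two
global task counters, to <cluster>_state_changes.json.
"""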
import json
import sys

import findspark
findspark.init()

import pyspark
import pyspark.sql

# The cluster identifier is passed as the only command-line argument and
# selects which trace files are read below.
cluster = sys.argv[1]

spark = pyspark.sql.SparkSession.builder \
    .appName("task_slowdown") \
    .config("spark.local.dir", "/run/tmpfiles.d/spark") \
    .config("spark.driver.memory", "124g") \
    .getOrCreate()
sc = spark.sparkContext
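
# Read every compressed instance-event shard of the selected cluster; the
# commented-out line below reads a single small test file instead, which is
# convenient while developing.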
df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz")
# df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json")
df.printSchema()
df.show()
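
# Driver-side counters updated from within for_each_task on the executors:
# "tot" counts every task examined, "non" those whose last termination event
# is not FINISH (type 6). Since they are updated inside a transformation,
# Spark may double count them if a stage is re-executed.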
non = sc.accumulator(0)
tot = sc.accumulator(0)

def for_each_task(ts):
    global non
    global tot
    # Events of a task are not guaranteed to be ordered in the trace,
    # so sort them by timestamp before walking them.
    ts = sorted(ts, key=lambda x: x["time"])
    last_term = None
    priority = None
    responding = False
    resp_burst_start = None
    resp_burst_type = None
    resp_time = []
    for t in ts:
        # Remember the first priority this task reports.
        if t["priority"] is not None and priority is None:
            priority = t["priority"]
        if responding:
            resp_burst_type.append(t["type"])
        # Types 4-8 are termination events (EVICT, FAIL, FINISH, KILL, LOST).
        if t["type"] >= 4 and t["type"] <= 8:
            last_term = t["type"]
            if responding:
                # This response-time interval has ended, so record its
                # duration and the event types seen while it was open.
                resp_time.append((t["time"] - resp_burst_start, resp_burst_type))
                responding = False
        if (not responding) and (t["type"] < 4 or t["type"] > 8):
            # A non-termination event opens a new response-time interval.
            resp_burst_start = t["time"]
            resp_burst_type = [t["type"]]
            responding = True
    tot.add(1)
    if last_term != 6:  # the task did not end with FINISH
        non.add(1)
    # Only tasks whose last termination event is type 5 yield data; every
    # other task is dropped by the filter in the pipeline below.
    return (priority, resp_time) if last_term == 5 else None

def cleanup(x):
    # Normalize a raw trace row into a plain dict: coerce the numeric fields,
    # default missing type/priority to 0 and build a unique task id from the
    # collection id and the instance index.
    return {
        "time": int(x.time),
        "type": 0 if x.type is None else int(x.type),
        "id": x.collection_id + "-" + x.instance_index,
        "priority": 0 if x.priority is None else int(x.priority),
    }
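
# The analysis is expressed as an RDD pipeline: filter the raw rows, normalize
# them with cleanup, group the events by task, reduce every task with
# for_each_task and regroup the surviving (priority, intervals) pairs by
# priority.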
df2 = (
    df.rdd
    # Keep only instance events that belong to collections of type 0 (jobs);
    # type 1 marks alloc sets, whose instances are not tasks.
    .filter(lambda x: x.collection_type is None or x.collection_type == 0)
    # Drop rows that lack the fields needed to identify and order events.
    .filter(lambda x: x.type is not None and x.time is not None
            and x.instance_index is not None and x.collection_id is not None)
    .map(cleanup)
    .groupBy(lambda x: x["id"])
    .mapValues(for_each_task)
    # for_each_task returns None for every task that is not of interest.
    .filter(lambda x: x[1] is not None)
    .map(lambda x: x[1])
    .groupBy(lambda x: x[0])
    .mapValues(lambda x: [e[1] for e in x])
)

# Collect the grouped results and dump them, together with the accumulator
# totals, to a per-cluster JSON file.
a = {"val": df2.collect(), "tot": tot.value, "non": non.value}
with open(cluster + "_state_changes.json", "w") as out:
    json.dump(a, out)
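
# A minimal sketch of how a downstream script could load the dump produced
# above (the names below are illustrative, not part of this script):
#
#   with open(cluster + "_state_changes.json") as f:
#       data = json.load(f)
#   print(data["tot"], data["non"])       # total vs. non-FINISH task counts
#   for priority, tasks in data["val"]:   # collect() pairs become 2-item lists
#       for intervals in tasks:           # one (delta, [types]) list per task
#           ...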