#!/usr/bin/env python3
# coding: utf-8

import json
import pandas
from IPython import display
import findspark
findspark.init()
import pyspark
import pyspark.sql
import sys

from pyspark.sql.functions import col, lag, when, concat_ws, last, first
from pyspark.sql import Window
from pyspark.sql.types import LongType

# Cluster name (e.g. the letter identifying one of the Google 2019 trace cells),
# passed as the first command-line argument.
cluster = sys.argv[1]

spark = pyspark.sql.SparkSession.builder \
    .appName("task_slowdown") \
    .config("spark.local.dir", "/run/tmpfiles.d/spark") \
    .config("spark.driver.memory", "124g") \
    .getOrCreate()
sc = spark.sparkContext

df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster +
                     "/" + cluster + "_instance_events*.json.gz")
# df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json")

df.printSchema()
df.show()

# Accumulators: "tot" counts every task processed, "non" counts the tasks whose
# last termination event was not of type 6.
non = sc.accumulator(0)
tot = sc.accumulator(0)


def for_each_task(ts):
    """Scan the time-ordered events of one task and collect its response-time bursts."""
    global non
    global tot

    ts = sorted(ts, key=lambda x: x["time"])

    last_term = None        # type of the most recent termination event (4..8)
    priority = None         # first non-null priority seen for this task
    responding = False      # True while inside a burst of non-termination events

    resp_burst_start = None  # timestamp at which the current burst started
    resp_burst_type = None   # event types observed during the current burst

    resp_time = []           # list of (burst duration, [event types]) pairs
    resp_time_last = 0

    for i, t in enumerate(ts):
        if t["priority"] is not None and priority is None:
            priority = t["priority"]
        if responding:
            resp_burst_type.append(t["type"])
        if t["type"] >= 4 and t["type"] <= 8:
            last_term = t["type"]
            if responding:
                # This response time interval has ended, so record the time delta and term
                resp_time.append((t["time"] - resp_burst_start, resp_burst_type))
                responding = False
        if (not responding) and (t["type"] < 4 or t["type"] > 8):
            resp_burst_start = t["time"]
            resp_burst_type = [t["type"]]
            responding = True

    tot.add(1)
    if last_term != 6:
        non.add(1)

    # Only tasks whose last termination event has type 5 contribute response times.
    return (priority, resp_time) if last_term == 5 else None


def cleanup(x):
    """Keep only the fields needed downstream, filling null type/priority with 0."""
    return {
        "time": int(x.time),
        "type": 0 if x.type is None else int(x.type),
        "id": x.collection_id + "-" + x.instance_index,
        "priority": 0 if x.priority is None else int(x.priority)
    }


df2 = df.rdd \
    .filter(lambda x: x.collection_type is None or x.collection_type == 0) \
    .filter(lambda x: x.type is not None and x.time is not None and
            x.instance_index is not None and x.collection_id is not None) \
    .map(cleanup) \
    .groupBy(lambda x: x["id"]) \
    .mapValues(for_each_task) \
    .filter(lambda x: x[1] is not None) \
    .map(lambda x: x[1]) \
    .groupBy(lambda x: x[0]) \
    .mapValues(lambda x: [e[1] for e in x])

# collect() triggers the Spark job, so the accumulator values read afterwards are final.
a = {"val": df2.collect(), "tot": tot.value, "non": non.value}

with open(cluster + "_state_changes.json", "w") as out:
    json.dump(a, out)
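
# --- optional sanity check (not part of the original script) ---------------------
# A minimal sketch that re-opens the JSON file written above and prints a summary,
# purely to document the output layout: "tot" and "non" are the accumulator totals,
# and "val" is a list of [priority, [response-time lists]] pairs. The print labels
# are assumptions about the semantics; drop this block if the script should only
# produce the file.
with open(cluster + "_state_changes.json") as f:
    result = json.load(f)

print("tasks processed:", result["tot"])
print("tasks whose final termination event was not of type 6:", result["non"])
for priority, tasks in result["val"]:
    print("priority", priority, "->", len(tasks), "tasks with recorded response times")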