From 7cfebd24093859909fc143beaa8e96b34603ea0c Mon Sep 17 00:00:00 2001
From: "Claudio Maggioni (maggicl)"
Date: Mon, 22 Feb 2021 13:12:38 +0000
Subject: [PATCH] Tidied machine_time_waste analysis

---
 machine_time_waste/machine_time_waste.py      | 103 ++++++++++++------
 ...b => machine_time_waste_inefficient.ipynb} |   0
 .../machine_time_waste_inefficient.py         |  81 ++++++++++++++
 machine_time_waste/machine_time_waste_rdd.py  | 103 ------------------
 4 files changed, 153 insertions(+), 134 deletions(-)
 rename machine_time_waste/{machine_time_waste.ipynb => machine_time_waste_inefficient.ipynb} (100%)
 create mode 100755 machine_time_waste/machine_time_waste_inefficient.py
 delete mode 100755 machine_time_waste/machine_time_waste_rdd.py

diff --git a/machine_time_waste/machine_time_waste.py b/machine_time_waste/machine_time_waste.py
index b8bf92e4..9aa69862 100755
--- a/machine_time_waste/machine_time_waste.py
+++ b/machine_time_waste/machine_time_waste.py
@@ -3,6 +3,24 @@
 
 # # Temporal impact: machine time waste
 
+# This analysis measures the time instances spend in submission, queueing, and execution. This preliminary script sums
+# the total time spent by instance executions to transition from each event type to another. Additionally, time sums
+# are partitioned by the last termination state of the instance they belong to (i.e. the last event of type
+# 4 <= x <= 8 for that instance).
+# Please note that events with a missing time, type, instance_index, or collection_id are ignored. The total number of
+# instance events in the trace and the number of events that survive filtering are saved in the output.
+
+# ## Data representation
+# The total and filtered counts mentioned above are stored under the "total" and "filtered" attributes in the root of
+# the generated JSON object. The "data" attribute is a list of pairs, each matching a final instance termination state
+# with the corresponding list of time totals per transition. Each transition total is keyed as "x-y", where "x" is the
+# last event type before the transition and "y" is the newly detected event type. Each total is computed by taking,
+# for every event of type "y" in an instance, the time difference from the nearest preceding event of type "x", and by
+# summing these differences. If an event of type "x" is immediately repeated, only the first occurrence in
+# chronological order is considered. If, however, the trace for an instance terminates after multiple repetitions of
+# event "x", an "x-x" time sum is registered, computed as the difference between the last and the first event of type
+# "x". Times are expressed in microseconds.
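+#
+# As an illustrative sketch (the numbers below are made up, and a null termination state means the instance never
+# reached an event with 4 <= x <= 8), the generated JSON has the following shape:
+#   {"total": 1000, "filtered": 998,
+#    "data": [[5, {"0-3": 12345, "3-5": 67890}], [null, {"0-3": 42}]]}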
+
+import json
 import pandas
 from IPython import display
 import findspark
 findspark.init()
 import pyspark
 import pyspark.sql
 import sys
@@ -15,15 +33,20 @@
 from pyspark.sql.functions import col, lag, when, concat_ws, last, first
 from pyspark.sql import Window
 from pyspark.sql.types import LongType
 
-cluster="b"
+if len(sys.argv) != 2 or len(sys.argv[1]) != 1:
+  print("usage: " + sys.argv[0] + " {cluster}", file=sys.stderr)
+  sys.exit(1)
+
+cluster=sys.argv[1]
 
 spark = pyspark.sql.SparkSession.builder \
   .appName("machine_time_waste") \
-  .config("spark.local.dir", "/run/tmpfiles.d/spark") \
+  .config("spark.local.dir", "/tmp/ramdisk/spark") \
   .config("spark.driver.memory", "124g") \
   .getOrCreate()
 
 df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz")
+# df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json")
 
 df.printSchema()
@@ -41,39 +64,57 @@ df2 = df \
   .select("time", "type", "id")
 
 df2.show()
 
-print("Total: " + str(df.count()))
-print("Filtered: " + str(df2.count()))
-
-# my_window = Window.partitionBy("machine_id", "id").orderBy(df2.time.asc())
+total = df.count()
+filtered = df2.count()
 
-w2 = Window.partitionBy("id").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing)
+print("Total: " + str(total))
+print("Filtered: " + str(filtered))
+
+r = df2.rdd
+
+def for_each_task(ts):
+  ts = sorted(ts, key=lambda x: x.time)
+  last_term = None
+  prev = None
+  tr = {}
+
+  for i,t in enumerate(ts):
+    if prev is not None and t.type == prev.type: # remove useless transitions
+      if (i == len(ts)-1): # if last
+        tr[str(prev.type) + "-" + str(t.type)] = t.time - prev.time # keep "loops" if last
+      else:
+        continue
+    if t.type >= 4 and t.type <= 8:
+      last_term = t.type
+    if prev is not None:
+      tr[str(prev.type) + "-" + str(t.type)] = t.time - prev.time
+    prev = t
+  return {"last_term": last_term, 'tr': tr}
+
+def sum_values(ds):
+  dsum = {}
+  for dt in ds:
+    d = dt["tr"]
+    for key in d:
+      if key not in dsum:
+        dsum[key] = d[key]
+      else:
+        dsum[key] += d[key]
+  return dsum
+
+r2 = r \
+  .groupBy(lambda x: x.id) \
+  .mapValues(for_each_task) \
+  .map(lambda x: x[1]) \
+  .groupBy(lambda x: x["last_term"]) \
+  .mapValues(sum_values) \
+  .collect()
+
+with open(cluster + "_state_changes.json", "w") as out:
+  json.dump({"filtered": filtered, "total": total, "data": r2}, out)
 
 # .withColumn("prev_time", lag(df2.time).over(my_window)) \
 # .withColumn("prev_type", lag(df2.type).over(my_window)) \
-df3 = df2 \
-  .withColumn("t3_time", when((df2.type != 3), None).otherwise(df2.time)) \
-  .withColumn("t45678_time", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \
-  .withColumn("t45678_type", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \
-  .withColumn("t01_time", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \
-  .withColumn("t01_type", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \
-  .withColumn("next_time", when(df2.type == 3, first(col("t45678_time"), True).over(w2)) \
-    .when((df2.type == 0) | (df2.type == 1), first(col("t3_time"), True).over(w2)) \
-    .when((df2.type >= 4) | (df2.type <= 8), first(col("t01_time"), True).over(w2)) \
-    .otherwise(None)) \
-  .withColumn("next_type", when(df2.type == 3, first(col("t45678_type"), True).over(w2)) \
-    .when((df2.type == 0) | (df2.type == 1), 3) \
-    .when((df2.type >= 4) | (df2.type <= 8), first(col("t01_type"), True).over(w2)) \
-    .otherwise(None)) \
-  .withColumn("last_term_type", last(col("t45678_type"), True).over(w2)) \
-  .withColumn("time_delta", col("next_time") - col("time")) \
-  .select("id", "time", "type", "last_term_type", "time_delta", "t01_time", \
-    "t01_type", "t3_time", "t45678_time", "t45678_type", "next_time", "next_type")
-
-df4 = df3.where(df3.next_type.isNotNull()).groupby("type", "next_type", "last_term_type").sum("time_delta")
-
-# df3.orderBy(df3.machine_id, df3.time).show(n=100)
-# df3.printSchema()
-df4.show(n=1000000)
-df4.write.csv("/home/claudio/google_2019/thesis_queries/machine_time_waste/" + cluster + "_state_change.csv")
 
 # vim: set ts=2 sw=2 et tw=120:
diff --git a/machine_time_waste/machine_time_waste.ipynb b/machine_time_waste/machine_time_waste_inefficient.ipynb
similarity index 100%
rename from machine_time_waste/machine_time_waste.ipynb
rename to machine_time_waste/machine_time_waste_inefficient.ipynb
diff --git a/machine_time_waste/machine_time_waste_inefficient.py b/machine_time_waste/machine_time_waste_inefficient.py
new file mode 100755
index 00000000..0a21b034
--- /dev/null
+++ b/machine_time_waste/machine_time_waste_inefficient.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+# # Temporal impact: machine time waste
+
+# Inefficient version of the machine_time_waste.py script in the same directory. Please ignore; kept for future reference.
+
+import pandas
+from IPython import display
+import findspark
+findspark.init()
+import pyspark
+import pyspark.sql
+import sys
+
+from pyspark.sql.functions import col, lag, when, concat_ws, last, first
+from pyspark.sql import Window
+from pyspark.sql.types import LongType
+
+cluster="b"
+
+spark = pyspark.sql.SparkSession.builder \
+  .appName("machine_time_waste") \
+  .config("spark.local.dir", "/run/tmpfiles.d/spark") \
+  .config("spark.driver.memory", "124g") \
+  .getOrCreate()
+
+df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz")
+
+df.printSchema()
+
+df.show()
+
+# .filter(df.collection_type == 0) \
+df2 = df \
+  .withColumn("time", col("time").cast(LongType())) \
+  .withColumn("type", col("type").cast(LongType())) \
+  .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type"))) \
+  .withColumn("id", concat_ws("-", "collection_id", "instance_index")) \
+  .where(col("time").isNotNull()) \
+  .where(col("type").isNotNull()) \
+  .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull())) \
+  .select("time", "type", "id")
+
+df2.show()
+print("Total: " + str(df.count()))
+print("Filtered: " + str(df2.count()))
+
+# my_window = Window.partitionBy("machine_id", "id").orderBy(df2.time.asc())
+
+w2 = Window.partitionBy("id").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing)
+
+# .withColumn("prev_time", lag(df2.time).over(my_window)) \
+# .withColumn("prev_type", lag(df2.type).over(my_window)) \
+df3 = df2 \
+  .withColumn("t3_time", when((df2.type != 3), None).otherwise(df2.time)) \
+  .withColumn("t45678_time", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \
+  .withColumn("t45678_type", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \
+  .withColumn("t01_time", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \
+  .withColumn("t01_type", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \
+  .withColumn("next_time", when(df2.type == 3, first(col("t45678_time"), True).over(w2)) \
+    .when((df2.type == 0) | (df2.type == 1), first(col("t3_time"), True).over(w2)) \
+    .when((df2.type >= 4) | (df2.type <= 8), first(col("t01_time"), True).over(w2)) \
+    .otherwise(None)) \
+  .withColumn("next_type", when(df2.type == 3, first(col("t45678_type"), True).over(w2)) \
+    .when((df2.type == 0) | (df2.type == 1), 3) \
+    .when((df2.type >= 4) | (df2.type <= 8), first(col("t01_type"), True).over(w2)) \
+    .otherwise(None)) \
+  .withColumn("last_term_type", last(col("t45678_type"), True).over(w2)) \
+  .withColumn("time_delta", col("next_time") - col("time")) \
+  .select("id", "time", "type", "last_term_type", "time_delta", "t01_time", \
+    "t01_type", "t3_time", "t45678_time", "t45678_type", "next_time", "next_type")
+
+df4 = df3.where(df3.next_type.isNotNull()).groupby("type", "next_type", "last_term_type").sum("time_delta")
+
+# df3.orderBy(df3.machine_id, df3.time).show(n=100)
+# df3.printSchema()
+df4.show(n=1000000)
+df4.write.csv("/home/claudio/google_2019/thesis_queries/machine_time_waste/" + cluster + "_state_change.csv")
+
+# vim: set ts=2 sw=2 et tw=120:
diff --git a/machine_time_waste/machine_time_waste_rdd.py b/machine_time_waste/machine_time_waste_rdd.py
deleted file mode 100755
index 33f66829..00000000
--- a/machine_time_waste/machine_time_waste_rdd.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
-# # Temporal impact: machine time waste
-
-import json
-import pandas
-from IPython import display
-import findspark
-findspark.init()
-import pyspark
-import pyspark.sql
-import sys
-
-from pyspark.sql.functions import col, lag, when, concat_ws, last, first
-from pyspark.sql import Window
-from pyspark.sql.types import LongType
-
-if len(sys.argv) != 2 or len(sys.argv[1]) != 1:
-  print("usage: " + sys.argv[0] + " {cluster}", file=sys.stderr)
-  sys.exit(1)
-
-cluster=sys.argv[1]
-
-spark = pyspark.sql.SparkSession.builder \
-  .appName("machine_time_waste") \
-  .config("spark.local.dir", "/tmp/ramdisk/spark") \
-  .config("spark.driver.memory", "124g") \
-  .getOrCreate()
-
-df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz")
-# df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json")
-
-df.printSchema()
-
-df.show()
-
-# .filter(df.collection_type == 0) \
-df2 = df \
-  .withColumn("time", col("time").cast(LongType())) \
-  .withColumn("type", col("type").cast(LongType())) \
-  .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type"))) \
-  .withColumn("id", concat_ws("-", "collection_id", "instance_index")) \
-  .where(col("time").isNotNull()) \
-  .where(col("type").isNotNull()) \
-  .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull())) \
-  .select("time", "type", "id")
-
-df2.show()
-
-total = df.count()
-filtered = df2.count()
-
-print("Total: " + str(total))
-print("Filtered: " + str(filtered))
-
-r = df2.rdd
-
-def for_each_task(ts):
-  ts = sorted(ts, key=lambda x: x.time)
-  last_term = None
-  prev = None
-  tr = {}
-
-  for i,t in enumerate(ts):
-    if prev is not None and t.type == prev.type: # remove useless transitions
-      if (i == len(ts)-1): # if last
-        tr[str(prev.type) + "-" + str(t.type)] = t.time - prev.time # keep "loops" if last
-      else:
-        continue
-    if t.type >= 4 and t.type <= 8:
-      last_term = t.type
-    if prev is not None:
-      tr[str(prev.type) + "-" + str(t.type)] = t.time - prev.time
-    prev = t
-  return {"last_term": last_term, 'tr': tr}
-
-def sum_values(ds):
-  dsum = {}
-  for dt in ds:
-    d = dt["tr"]
-    for key in d:
-      if key not in dsum:
-        dsum[key] = d[key]
-      else:
-        dsum[key] += d[key]
-  return dsum
-
-r2 = r \
-  .groupBy(lambda x: x.id) \
-  .mapValues(for_each_task) \
-  .map(lambda x: x[1]) \
-  .groupBy(lambda x: x["last_term"]) \
-  .mapValues(sum_values) \
-  .collect()
-
-with open(cluster + "_state_changes.json", "w") as out:
-  json.dump({"filtered": filtered, "total": total, "data": r2}, out)
-
-# .withColumn("prev_time", lag(df2.time).over(my_window)) \
-# .withColumn("prev_type", lag(df2.type).over(my_window)) \
-
-# vim: set ts=2 sw=2 et tw=120:
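
As a quick sanity check of the transition rules described in the header comment, the sketch below (illustrative only:
the times and types are made up, and for_each_task is assumed to be in scope from the tidied machine_time_waste.py)
runs the per-instance logic on a hand-built toy trace:

  from collections import namedtuple

  Event = namedtuple("Event", ["time", "type"])

  # toy trace for one instance: a type-0 event, a type-3 event repeated once,
  # then a termination event (type 5, i.e. in the 4 <= x <= 8 range)
  trace = [Event(0, 0), Event(10, 3), Event(10, 3), Event(25, 5)]

  # the repeated type-3 event collapses onto its first occurrence, so this prints
  # {'last_term': 5, 'tr': {'0-3': 10, '3-5': 15}}
  print(for_each_task(trace))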