bachelorThesis/machine_time_waste/machine_time_waste.py

121 lines
4.3 KiB
Python
Raw Permalink Normal View History

2021-02-19 18:47:56 +00:00
#!/usr/bin/env python3
# coding: utf-8
# # Temporal impact: machine time waste
2021-02-22 13:12:38 +00:00
# This analysis is meant to analyse the time spent by instance events doing submission, queueing, and execution. This
# preliminary script sums the total time spent by instance executions to transition from each event type to another.
# Additionally, time sums are partitioned by the last termination state of the instance they belong to (i.e. the last
# 4<=x<=8 event type for that instance).
# Please note that events with either missing time, instance_index or collection_id are ignored, while a missing type
# is treated as type 0. Total number of instance events in the trace and filtered number of events are saved in the
# output.
# ## Data representation
# Total and filtered totals mentioned before are under "total" and "filtered" attributes in the root of the generated
# JSON object. The "data" attribute is a list of pairs of final instance termination states and the corresponding
# time totals for each transition. Each transition total is represented in the form of "x-y" where "x" is the last
# event type prior to the transition and "y" is the new event detected. Times are calculated by summing, for each
# instance, the time of each event "y" minus the time of the nearest preceding event of type "x". If an event "x" is
# repeated multiple times immediately after an event of the same type, only the first event in chronological order is
# considered. If however after multiple repetitions of the event "x" the trace for that instance terminates, an "x-x"
# time sum is registered by computing the difference between the last and the first event of "x" type. Times are
# represented in microseconds.
import json
2021-02-19 18:47:56 +00:00
import pandas
from IPython import display
import findspark
findspark.init()
import pyspark
import pyspark.sql
import sys
from pyspark.sql.functions import col, lag, when, concat_ws, last, first
from pyspark.sql import Window
from pyspark.sql.types import LongType
2021-02-22 13:12:38 +00:00
# Require exactly one command-line argument, and that argument must be a
# single character; otherwise print the usage line on stderr and exit with 1.
if not (len(sys.argv) == 2 and len(sys.argv[1]) == 1):
    print(f"usage: {sys.argv[0]} {{cluster}}", file=sys.stderr)
    sys.exit(1)

# Cluster name: used below to build the input path and the output file name.
cluster = sys.argv[1]
2021-02-19 18:47:56 +00:00
# Create (or reuse) the Spark session for this analysis. The builder chain is
# wrapped in parentheses so that no line continuation can be broken by stray
# text between the chained calls.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("machine_time_waste")
    .config("spark.local.dir", "/tmp/ramdisk/spark")
    .config("spark.driver.memory", "124g")
    .getOrCreate()
)

# Read every gzipped JSON instance-events file of the selected cluster into a
# single DataFrame (the "*" in the file name is a glob pattern).
df = spark.read.json(
    "/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz"
)
2021-02-22 13:12:38 +00:00
# df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json")
2021-02-19 18:47:56 +00:00
# Print the inferred schema and a sample of the raw rows for inspection.
df.printSchema()
df.show()

# .filter(df.collection_type == 0) \

# Reduce the raw events to (time, type, id) rows.
df2 = (
    df
    # "time" and "type" are cast to long integers.
    .withColumn("time", col("time").cast(LongType()))
    .withColumn("type", col("type").cast(LongType()))
    # A null "type" is replaced by 0.
    .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type")))
    # Instance identifier: collection_id and instance_index joined by "-".
    .withColumn("id", concat_ws("-", "collection_id", "instance_index"))
    # Drop rows that lack a time stamp.
    .where(col("time").isNotNull())
    # NOTE(review): nulls in "type" were already replaced by 0 above, so this
    # filter looks like a no-op; kept to preserve the original pipeline.
    .where(col("type").isNotNull())
    # Drop rows that cannot be attributed to an instance.
    .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull()))
    .select("time", "type", "id")
)
df2.show()
2021-02-22 13:12:38 +00:00
# Number of events before and after filtering; both are reported on stdout.
total = df.count()
filtered = df2.count()
print(f"Total: {total}")
print(f"Filtered: {filtered}")

# Switch to the RDD view of the filtered events for per-instance processing
# with plain Python functions.
r = df2.rdd
def for_each_task(ts):
    """Compute the time spent in each event-type transition of one instance.

    Events are processed in chronological order. A run of consecutive events
    of the same type is collapsed onto its first event, so a transition "x-y"
    is timed from the first event of the "x" run to the first event of the
    following "y" run. If the trace ends inside such a run, an "x-x" entry
    holding the time between the first and the last event of the run is
    recorded.

    Args:
        ts: iterable of the events of a single instance; every event exposes
            a ``time`` and a ``type`` attribute.

    Returns:
        A dict with two keys:
          * ``"last_term"``: the type of the chronologically last event whose
            type lies in the range 4..8, or ``None`` if there is none;
          * ``"tr"``: dict mapping ``"x-y"`` to the total time spent in that
            transition. When the same transition happens several times for
            the instance, the durations are added up.
    """
    events = sorted(ts, key=lambda e: e.time)
    final_index = len(events) - 1

    last_term = None
    prev = None
    tr = {}

    for i, t in enumerate(events):
        # Skip repetitions of the previous event type, except when the
        # repetition is the very last event: that one closes an "x-x" loop.
        if prev is not None and t.type == prev.type and i != final_index:
            continue

        if 4 <= t.type <= 8:
            last_term = t.type

        if prev is not None:
            key = str(prev.type) + "-" + str(t.type)
            # Accumulate rather than overwrite, so that a transition occurring
            # more than once in the same instance contributes every time.
            tr[key] = tr.get(key, 0) + (t.time - prev.time)

        prev = t

    return {"last_term": last_term, "tr": tr}
def sum_values(ds):
    """Add up the per-transition times of several instances.

    Args:
        ds: iterable of dicts, each holding under the key ``"tr"`` a dict
            that maps a transition name to a time value.

    Returns:
        A dict mapping every transition name found in the input to the sum
        of its values over all elements of ``ds``.
    """
    totals = {}
    for record in ds:
        for transition, elapsed in record["tr"].items():
            if transition in totals:
                totals[transition] += elapsed
            else:
                totals[transition] = elapsed
    return totals
# Aggregate the events in two stages and bring the result to the driver:
#   1. group the events by instance id and compute the transition times of
#      each instance;
#   2. discard the instance id, regroup the per-instance results by their
#      "last_term" value and sum the transition times within each group.
r2 = (
    r.groupBy(lambda event: event.id)
    .mapValues(for_each_task)
    .map(lambda keyed: keyed[1])
    .groupBy(lambda result: result["last_term"])
    .mapValues(sum_values)
    .collect()
)

# Write the event counts and the aggregated times as one JSON document in the
# current working directory.
with open(cluster + "_state_changes.json", "w") as out:
    json.dump({"filtered": filtered, "total": total, "data": r2}, out)
2021-02-19 18:47:56 +00:00
# .withColumn("prev_time", lag(df2.time).over(my_window)) \
# .withColumn("prev_type", lag(df2.type).over(my_window)) \
# vim: set ts=2 sw=2 et tw=120: