#!/usr/bin/env python3
# coding: utf-8

# # Temporal impact: machine time waste
#
# Reads the per-instance event stream of one Google 2019 trace cluster and,
# for every instance (identified by "collection_id-instance_index"), pairs
# each event with the next "matching" event in time, so that the time spent
# between state changes can be summed per (type, next_type, last_term_type)
# triple and written out as CSV.

import pandas
from IPython import display
import findspark

findspark.init()

import pyspark
import pyspark.sql
import sys

from pyspark.sql.functions import col, lag, when, concat_ws, last, first
from pyspark.sql import Window
from pyspark.sql.types import LongType

# Which trace cluster to analyse (the 2019 traces ship several cells;
# presumably one of "a".."h" -- TODO confirm against the data directory).
cluster = "b"

spark = pyspark.sql.SparkSession.builder \
    .appName("machine_time_waste") \
    .config("spark.local.dir", "/run/tmpfiles.d/spark") \
    .config("spark.driver.memory", "124g") \
    .getOrCreate()

df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz")

df.printSchema()

df.show()

# Normalise the raw events:
#  - cast time/type to 64-bit ints (JSON delivers them as strings),
#  - treat a missing type as 0,
#  - build a synthetic per-instance id "collection_id-instance_index",
#  - drop rows missing any field the analysis relies on.
df2 = df \
    .withColumn("time", col("time").cast(LongType())) \
    .withColumn("type", col("type").cast(LongType())) \
    .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type"))) \
    .withColumn("id", concat_ws("-", "collection_id", "instance_index")) \
    .where(col("time").isNotNull()) \
    .where(col("type").isNotNull()) \
    .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull())) \
    .select("time", "type", "id")

df2.show()

print("Total: " + str(df.count()))
print("Filtered: " + str(df2.count()))

# Per-instance window looking from the current row forward in time, so
# first(..., ignorenulls=True) over it yields the *next* event of a group.
w2 = Window.partitionBy("id").orderBy(df2.time.asc()) \
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# Event-type groups (trace semantics assumed, TODO confirm against the
# Google 2019 trace documentation):
#   0/1  : submission/queueing events,
#   3    : scheduling event,
#   4..8 : termination events.
# The tXX_time/tXX_type helper columns carry the row's own time/type only
# when the row belongs to that group (NULL otherwise), which lets the
# forward window pick out the next event of each group.
df3 = df2 \
    .withColumn("t3_time", when((df2.type != 3), None).otherwise(df2.time)) \
    .withColumn("t45678_time", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \
    .withColumn("t45678_type", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \
    .withColumn("t01_time", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \
    .withColumn("t01_type", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \
    .withColumn("next_time",
                when(df2.type == 3, first(col("t45678_time"), True).over(w2))
                .when((df2.type == 0) | (df2.type == 1), first(col("t3_time"), True).over(w2))
                # BUG FIX: was `(type >= 4) | (type <= 8)`, which is true for
                # every type and so swallowed types not handled above (e.g. 2);
                # the intended membership test is type in [4, 8].
                .when((df2.type >= 4) & (df2.type <= 8), first(col("t01_time"), True).over(w2))
                .otherwise(None)) \
    .withColumn("next_type",
                when(df2.type == 3, first(col("t45678_type"), True).over(w2))
                .when((df2.type == 0) | (df2.type == 1), 3)
                # BUG FIX: same tautological `|` corrected to `&` as above.
                .when((df2.type >= 4) & (df2.type <= 8), first(col("t01_type"), True).over(w2))
                .otherwise(None)) \
    .withColumn("last_term_type", last(col("t45678_type"), True).over(w2)) \
    .withColumn("time_delta", col("next_time") - col("time")) \
    .select("id", "time", "type", "last_term_type", "time_delta", "t01_time",
            "t01_type", "t3_time", "t45678_time", "t45678_type", "next_time", "next_type")

# Total time spent in each (current state, next state, eventual termination
# type) combination; rows with no next event contribute nothing.
df4 = df3.where(df3.next_type.isNotNull()) \
    .groupby("type", "next_type", "last_term_type") \
    .sum("time_delta")

df4.show(n=1000000)

df4.write.csv("/home/claudio/google_2019/thesis_queries/machine_time_waste/" + cluster + "_state_change.csv")

# vim: set ts=2 sw=2 et tw=120: