#!/usr/bin/env python3
# coding: utf-8

# # Temporal impact: machine time waste
#
# Reads the per-instance event stream of one Google 2019 trace cluster and,
# for every instance (identified by "collection_id-instance_index"), pairs
# each event with the next "matching" event in time, so that the time spent
# between state changes can be summed per (type, next_type, last_term_type)
# triple and written out as CSV.

import pandas
from IPython import display
import findspark

findspark.init()

import pyspark
import pyspark.sql
import sys

from pyspark.sql.functions import col, lag, when, concat_ws, last, first
from pyspark.sql import Window
from pyspark.sql.types import LongType

# Which trace cluster to analyse (the 2019 traces ship several cells;
# presumably one of "a".."h" -- TODO confirm against the data directory).
cluster = "b"

spark = pyspark.sql.SparkSession.builder \
    .appName("machine_time_waste") \
    .config("spark.local.dir", "/run/tmpfiles.d/spark") \
    .config("spark.driver.memory", "124g") \
    .getOrCreate()

df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz")

df.printSchema()

df.show()

# Normalise the raw events:
#  - cast time/type to 64-bit ints (JSON delivers them as strings),
#  - treat a missing type as 0,
#  - build a synthetic per-instance id "collection_id-instance_index",
#  - drop rows missing any field the analysis relies on.
df2 = df \
    .withColumn("time", col("time").cast(LongType())) \
    .withColumn("type", col("type").cast(LongType())) \
    .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type"))) \
    .withColumn("id", concat_ws("-", "collection_id", "instance_index")) \
    .where(col("time").isNotNull()) \
    .where(col("type").isNotNull()) \
    .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull())) \
    .select("time", "type", "id")

df2.show()

print("Total: " + str(df.count()))
print("Filtered: " + str(df2.count()))

# Per-instance window looking from the current row forward in time, so
# first(..., ignorenulls=True) over it yields the *next* event of a group.
w2 = Window.partitionBy("id").orderBy(df2.time.asc()) \
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# Event-type groups (trace semantics assumed, TODO confirm against the
# Google 2019 trace documentation):
#   0/1  : submission/queueing events,
#   3    : scheduling event,
#   4..8 : termination events.
# The tXX_time/tXX_type helper columns carry the row's own time/type only
# when the row belongs to that group (NULL otherwise), which lets the
# forward window pick out the next event of each group.
df3 = df2 \
    .withColumn("t3_time", when((df2.type != 3), None).otherwise(df2.time)) \
    .withColumn("t45678_time", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \
    .withColumn("t45678_type", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \
    .withColumn("t01_time", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \
    .withColumn("t01_type", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \
    .withColumn("next_time",
                when(df2.type == 3, first(col("t45678_time"), True).over(w2))
                .when((df2.type == 0) | (df2.type == 1), first(col("t3_time"), True).over(w2))
                # BUG FIX: was `(type >= 4) | (type <= 8)`, which is true for
                # every type and so swallowed types not handled above (e.g. 2);
                # the intended membership test is type in [4, 8].
                .when((df2.type >= 4) & (df2.type <= 8), first(col("t01_time"), True).over(w2))
                .otherwise(None)) \
    .withColumn("next_type",
                when(df2.type == 3, first(col("t45678_type"), True).over(w2))
                .when((df2.type == 0) | (df2.type == 1), 3)
                # BUG FIX: same tautological `|` corrected to `&` as above.
                .when((df2.type >= 4) & (df2.type <= 8), first(col("t01_type"), True).over(w2))
                .otherwise(None)) \
    .withColumn("last_term_type", last(col("t45678_type"), True).over(w2)) \
    .withColumn("time_delta", col("next_time") - col("time")) \
    .select("id", "time", "type", "last_term_type", "time_delta", "t01_time",
            "t01_type", "t3_time", "t45678_time", "t45678_type", "next_time", "next_type")

# Total time spent in each (current state, next state, eventual termination
# type) combination; rows with no next event contribute nothing.
df4 = df3.where(df3.next_type.isNotNull()) \
    .groupby("type", "next_type", "last_term_type") \
    .sum("time_delta")

df4.show(n=1000000)

df4.write.csv("/home/claudio/google_2019/thesis_queries/machine_time_waste/" + cluster + "_state_change.csv")

# vim: set ts=2 sw=2 et tw=120: