From 1bce593fd6435c4180cb26d65ccf85039f8e52cf Mon Sep 17 00:00:00 2001 From: "Claudio Maggioni (maggicl)" Date: Fri, 19 Feb 2021 18:47:56 +0000 Subject: [PATCH] machine time waste --- .../machine_time_waste-checkpoint.ipynb | 895 ++++++++++++++++++ machine_time_waste/b_state_changes.json | 3 + machine_time_waste/d_state_changes.json | 3 + machine_time_waste/g_state_changes.json | 3 + machine_time_waste/h_state_changes.json | 3 + machine_time_waste/machine_time_waste.ipynb | 895 ++++++++++++++++++ machine_time_waste/machine_time_waste.py | 79 ++ machine_time_waste/machine_time_waste_rdd.py | 103 ++ 8 files changed, 1984 insertions(+) create mode 100644 machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb create mode 100644 machine_time_waste/b_state_changes.json create mode 100644 machine_time_waste/d_state_changes.json create mode 100644 machine_time_waste/g_state_changes.json create mode 100644 machine_time_waste/h_state_changes.json create mode 100644 machine_time_waste/machine_time_waste.ipynb create mode 100755 machine_time_waste/machine_time_waste.py create mode 100755 machine_time_waste/machine_time_waste_rdd.py diff --git a/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb b/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb new file mode 100644 index 00000000..40fa4661 --- /dev/null +++ b/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb @@ -0,0 +1,895 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "proper-gnome", + "metadata": {}, + "source": [ + "# Temporal impact: machine time waste" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fantastic-harrison", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "import pandas\n", + "from IPython import display\n", + "import findspark\n", + "findspark.init()\n", + "import pyspark\n", + "import pyspark.sql\n", + "import sys\n", + "\n", + "from pyspark.sql.functions import col, lag, when, concat_ws, last, first\n", + "from pyspark.sql import Window\n", + "from pyspark.sql.types import LongType" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "failing-rebecca", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "cluster=\"b\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "transsexual-baptist", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:root:Exception while sending command.\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1207, in send_command\n", + " raise Py4JNetworkError(\"Answer from Java side is empty\")\n", + "py4j.protocol.Py4JNetworkError: Answer from Java side is empty\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1033, in send_command\n", + " response = connection.send_command(command)\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1212, in send_command\n", + " \"Error while receiving\", e, proto.ERROR_ON_RECEIVE)\n", + "py4j.protocol.Py4JNetworkError: Error while receiving\n", + "ERROR:py4j.java_gateway:An error 
occurred while trying to connect to the Java server (127.0.0.1:36135)\n", + "Traceback (most recent call last):\n", +
" File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [Errno 111] Connection refused\n", + "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", + "Traceback (most recent call last):\n", + " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"\", line 5, in \n", + " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", + " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", + " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", + " answer, self.gateway_client, self.target_id, self.name)\n", + " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", + " return f(*a, **kw)\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", + " format(target_id, \".\", name))\n", + "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", + " stb = value._render_traceback_()\n", + "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", + " connection = self.deque.pop()\n", + "IndexError: pop from an empty deque\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [Errno 111] Connection refused\n" + ] + }, + { + "ename": "Py4JError", + "evalue": "An error occurred while calling o26.json", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPy4JError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mgetOrCreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/claudio/google_2019/instance_events/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m 
\u001b[0;34m\"_instance_events*.json.gz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/spark/python/pyspark/sql/readwriter.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding, locale, pathGlobFilter, recursiveFileLookup)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 300\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jreader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_spark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPythonUtils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoSeq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 301\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRDD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1303\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1304\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1305\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1306\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1307\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mconverted\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 334\u001b[0m raise Py4JError(\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m format(target_id, \".\", name))\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mPy4JError\u001b[0m: An error occurred while calling o26.json" + ] + } + ], + "source": [ + "spark = pyspark.sql.SparkSession.builder \\\n", + " .appName(\"machine_time_waste\") \\\n", + " .getOrCreate()\n", + "\n", + "df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "juvenile-absolute", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "lucky-western", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "normal-settlement", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# .filter(df.collection_type == 0) \\\n", + "df2 = df \\\n", + " .withColumn(\"time\", col(\"time\").cast(LongType())) \\\n", + " .withColumn(\"type\", col(\"type\").cast(LongType())) \\\n", + " .withColumn(\"type\", when(col(\"type\").isNull(), 0).otherwise(col(\"type\"))) \\\n", + " .withColumn(\"id\", concat_ws(\"-\", \"collection_id\", \"instance_index\")) \\\n", + " .where(col(\"time\").isNotNull()) \\\n", + " .where(col(\"type\").isNotNull()) \\\n", + " .where((col(\"instance_index\").isNotNull()) & (col(\"collection_id\").isNotNull())) \\\n", + " 
.select(\"machine_id\", \"id\", \"time\", \"type\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "typical-homeless", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df2.show()\n", + "print(\"Total: \" + str(df.count()))\n", + "print(\"Filtered: \" + str(df2.count()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "collect-saying", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# my_window = Window.partitionBy(\"machine_id\", \"id\").orderBy(df2.time.asc())\n", + "\n", + "w2 = Window.partitionBy(\"id\").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cooperative-appraisal", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# .withColumn(\"prev_time\", lag(df2.time).over(my_window)) \\\n", + "# .withColumn(\"prev_type\", lag(df2.type).over(my_window)) \\\n", + "\n", + "df3 = df2 \\\n", + " .withColumn(\"t3_time\", when((df2.type != 3), None).otherwise(df2.time)) \\\n", + " .withColumn(\"t45678_time\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \\\n", + " .withColumn(\"t45678_type\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \\\n", + " .withColumn(\"t01_time\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \\\n", + " .withColumn(\"t01_type\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \\\n", + " .withColumn(\"next_time\", when(df2.type == 3, first(col(\"t45678_time\"), ignorenulls=True).over(w2)) \\\n", + " .when((df2.type == 0) | (df2.type == 1), first(col(\"t3_time\"), ignorenulls=True).over(w2)) \\\n", + " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_time\"), ignorenulls=True).over(w2)) \\\n", + " .otherwise(None)) \\\n", + " .withColumn(\"next_type\", when(df2.type == 3, first(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n", + " .when((df2.type == 0) | (df2.type == 1), 3) \\\n", + " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_type\"), ignorenulls=True).over(w2)) \\\n", + " .otherwise(None)) \\\n", + " .withColumn(\"last_term_type\", last(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n", + " .withColumn(\"time_delta\", col(\"next_time\") - col(\"time\")) \\\n", + " .select(\"machine_id\", \"id\", \"time\", \"type\", \"last_term_type\", \"time_delta\", \"t01_time\", \"t01_type\", \"t3_time\", \"t45678_time\", \"t45678_type\", \"next_time\", \"next_type\") \\" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ideal-angle", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df4 = df3.where(df3.next_type.isNotNull()).groupby(\"type\", \"next_type\", \"last_term_type\").sum(\"time_delta\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "working-difficulty", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# df3.orderBy(df3.machine_id, df3.time).show(n=100)\n", + "# df3.printSchema()\n", + "df4.show(n=1000000)\n", + "df4.write.csv(\"/home/claudio/google_2019/thesis_queries/machine_time_waste/\" + cluster + \"_state_change.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/machine_time_waste/b_state_changes.json b/machine_time_waste/b_state_changes.json new file mode 100644 index 00000000..73d98f42 --- /dev/null +++ b/machine_time_waste/b_state_changes.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a252a7fff64755328867564af428a6e3a2f692ccf84805a60030b6cd4c311b91 +size 6851 diff --git a/machine_time_waste/d_state_changes.json b/machine_time_waste/d_state_changes.json new file mode 100644 index 00000000..c9f6014a --- /dev/null +++ b/machine_time_waste/d_state_changes.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18ff1f1529868b2f4b86216af8bf6c29c0b5b4b79505667309d7a73f3eee3ff6 +size 7303 diff --git a/machine_time_waste/g_state_changes.json b/machine_time_waste/g_state_changes.json new file mode 100644 index 00000000..ec2ad4a1 --- /dev/null +++ b/machine_time_waste/g_state_changes.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d8c25c7fa066a1a4c5fc2c9d3e92e0f777e2d131112fdd226b9cf59d84515e +size 6856 diff --git a/machine_time_waste/h_state_changes.json b/machine_time_waste/h_state_changes.json new file mode 100644 index 00000000..0de0ba63 --- /dev/null +++ b/machine_time_waste/h_state_changes.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b23215eea5fe494eaf1c8618824622736fcdf07740867330e8443c354b8b64 +size 7076 diff --git a/machine_time_waste/machine_time_waste.ipynb b/machine_time_waste/machine_time_waste.ipynb new file mode 100644 index 00000000..40fa4661 --- /dev/null +++ b/machine_time_waste/machine_time_waste.ipynb @@ -0,0 +1,895 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "proper-gnome", + "metadata": {}, + "source": [ + "# Temporal impact: machine time waste" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fantastic-harrison", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "import pandas\n", + "from IPython import display\n", + "import findspark\n", + "findspark.init()\n", + "import pyspark\n", + "import pyspark.sql\n", + "import sys\n", + "\n", + "from pyspark.sql.functions import col, lag, when, concat_ws, last, first\n", + "from pyspark.sql import Window\n", + "from pyspark.sql.types import LongType" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "failing-rebecca", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "cluster=\"b\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "transsexual-baptist", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:root:Exception while sending command.\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1207, in send_command\n", + " raise Py4JNetworkError(\"Answer from Java side is empty\")\n", + "py4j.protocol.Py4JNetworkError: Answer from Java side is empty\n", + "\n", + "During handling of the above exception, another 
exception occurred:\n", + "\n", + "    [... the same Py4J connection-refused traceback repeats several more times; omitted ...]\n", + "\n", + "During handling of the above exception, 
another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", + " connection = self.deque.pop()\n", + "IndexError: pop from an empty deque\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [Errno 111] Connection refused\n", + "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", + "Traceback (most recent call last):\n", + " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"\", line 5, in \n", + " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", + " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", + " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", + " answer, self.gateway_client, self.target_id, self.name)\n", + " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", + " return f(*a, **kw)\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", + " format(target_id, \".\", name))\n", + "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", + " stb = value._render_traceback_()\n", + "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", + " connection = self.deque.pop()\n", + "IndexError: pop from an empty deque\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [Errno 111] Connection refused\n" + ] + }, + { + "ename": "Py4JError", + "evalue": "An error occurred while calling o26.json", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPy4JError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mgetOrCreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/claudio/google_2019/instance_events/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"_instance_events*.json.gz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/spark/python/pyspark/sql/readwriter.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding, locale, pathGlobFilter, recursiveFileLookup)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 300\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jreader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_spark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPythonUtils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoSeq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 301\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRDD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1303\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1304\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1305\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1306\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1307\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mconverted\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 334\u001b[0m raise Py4JError(\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m format(target_id, \".\", name))\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mPy4JError\u001b[0m: An error occurred while calling o26.json" + ] + } + ], + "source": [ + "spark = pyspark.sql.SparkSession.builder \\\n", + " .appName(\"machine_time_waste\") \\\n", + " .getOrCreate()\n", + "\n", + "df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "juvenile-absolute", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "lucky-western", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "normal-settlement", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# .filter(df.collection_type == 0) \\\n", + "df2 = df \\\n", + " .withColumn(\"time\", col(\"time\").cast(LongType())) \\\n", + " .withColumn(\"type\", col(\"type\").cast(LongType())) \\\n", + " .withColumn(\"type\", when(col(\"type\").isNull(), 0).otherwise(col(\"type\"))) \\\n", + " .withColumn(\"id\", 
concat_ws(\"-\", \"collection_id\", \"instance_index\")) \\\n", + " .where(col(\"time\").isNotNull()) \\\n", + " .where(col(\"type\").isNotNull()) \\\n", + " .where((col(\"instance_index\").isNotNull()) & (col(\"collection_id\").isNotNull())) \\\n", + " .select(\"machine_id\", \"id\", \"time\", \"type\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "typical-homeless", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df2.show()\n", + "print(\"Total: \" + str(df.count()))\n", + "print(\"Filtered: \" + str(df2.count()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "collect-saying", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# my_window = Window.partitionBy(\"machine_id\", \"id\").orderBy(df2.time.asc())\n", + "\n", + "w2 = Window.partitionBy(\"id\").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cooperative-appraisal", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# .withColumn(\"prev_time\", lag(df2.time).over(my_window)) \\\n", + "# .withColumn(\"prev_type\", lag(df2.type).over(my_window)) \\\n", + "\n", + "df3 = df2 \\\n", + " .withColumn(\"t3_time\", when((df2.type != 3), None).otherwise(df2.time)) \\\n", + " .withColumn(\"t45678_time\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \\\n", + " .withColumn(\"t45678_type\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \\\n", + " .withColumn(\"t01_time\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \\\n", + " .withColumn(\"t01_type\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \\\n", + " .withColumn(\"next_time\", when(df2.type == 3, first(col(\"t45678_time\"), ignorenulls=True).over(w2)) \\\n", + " .when((df2.type == 0) | (df2.type == 1), first(col(\"t3_time\"), ignorenulls=True).over(w2)) \\\n", + " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_time\"), ignorenulls=True).over(w2)) \\\n", + " .otherwise(None)) \\\n", + " .withColumn(\"next_type\", when(df2.type == 3, first(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n", + " .when((df2.type == 0) | (df2.type == 1), 3) \\\n", + " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_type\"), ignorenulls=True).over(w2)) \\\n", + " .otherwise(None)) \\\n", + " .withColumn(\"last_term_type\", last(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n", + " .withColumn(\"time_delta\", col(\"next_time\") - col(\"time\")) \\\n", + " .select(\"machine_id\", \"id\", \"time\", \"type\", \"last_term_type\", \"time_delta\", \"t01_time\", \"t01_type\", \"t3_time\", \"t45678_time\", \"t45678_type\", \"next_time\", \"next_type\") \\" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ideal-angle", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "df4 = df3.where(df3.next_type.isNotNull()).groupby(\"type\", \"next_type\", \"last_term_type\").sum(\"time_delta\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "working-difficulty", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# df3.orderBy(df3.machine_id, 
df3.time).show(n=100)\n", + "# df3.printSchema()\n", + "df4.show(n=1000000)\n", + "df4.write.csv(\"/home/claudio/google_2019/thesis_queries/machine_time_waste/\" + cluster + \"_state_change.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/machine_time_waste/machine_time_waste.py b/machine_time_waste/machine_time_waste.py new file mode 100755 index 00000000..b8bf92e4 --- /dev/null +++ b/machine_time_waste/machine_time_waste.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# # Temporal impact: machine time waste + +import pandas +from IPython import display +import findspark +findspark.init() +import pyspark +import pyspark.sql +import sys + +from pyspark.sql.functions import col, lag, when, concat_ws, last, first +from pyspark.sql import Window +from pyspark.sql.types import LongType + +cluster="b" + +spark = pyspark.sql.SparkSession.builder \ + .appName("machine_time_waste") \ + .config("spark.local.dir", "/run/tmpfiles.d/spark") \ + .config("spark.driver.memory", "124g") \ + .getOrCreate() + +df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz") + +df.printSchema() + +df.show() + +# .filter(df.collection_type == 0) \ +df2 = df \ + .withColumn("time", col("time").cast(LongType())) \ + .withColumn("type", col("type").cast(LongType())) \ + .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type"))) \ + .withColumn("id", concat_ws("-", "collection_id", "instance_index")) \ + .where(col("time").isNotNull()) \ + .where(col("type").isNotNull()) \ + .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull())) \ + .select("time", "type", "id") + +df2.show() +print("Total: " + str(df.count())) +print("Filtered: " + str(df2.count())) + +# my_window = Window.partitionBy("machine_id", "id").orderBy(df2.time.asc()) + +w2 = Window.partitionBy("id").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing) + +# .withColumn("prev_time", lag(df2.time).over(my_window)) \ +# .withColumn("prev_type", lag(df2.type).over(my_window)) \ +df3 = df2 \ + .withColumn("t3_time", when((df2.type != 3), None).otherwise(df2.time)) \ + .withColumn("t45678_time", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \ + .withColumn("t45678_type", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \ + .withColumn("t01_time", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \ + .withColumn("t01_type", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \ + .withColumn("next_time", when(df2.type == 3, first(col("t45678_time"), True).over(w2)) \ + .when((df2.type == 0) | (df2.type == 1), first(col("t3_time"), True).over(w2)) \ + .when((df2.type >= 4) | (df2.type <= 8), first(col("t01_time"), True).over(w2)) \ + .otherwise(None)) \ + .withColumn("next_type", when(df2.type == 3, first(col("t45678_type"), True).over(w2)) \ + .when((df2.type == 0) | (df2.type == 1), 3) \ + .when((df2.type >= 4) | (df2.type <= 8), first(col("t01_type"), True).over(w2)) \ + .otherwise(None)) \ + .withColumn("last_term_type", last(col("t45678_type"), 
True).over(w2)) \ + .withColumn("time_delta", col("next_time") - col("time")) \ + .select("id", "time", "type", "last_term_type", "time_delta", "t01_time", \ + "t01_type", "t3_time", "t45678_time", "t45678_type", "next_time", "next_type") + +df4 = df3.where(df3.next_type.isNotNull()).groupby("type", "next_type", "last_term_type").sum("time_delta") + +# df3.orderBy(df3.machine_id, df3.time).show(n=100) +# df3.printSchema() +df4.show(n=1000000) +df4.write.csv("/home/claudio/google_2019/thesis_queries/machine_time_waste/" + cluster + "_state_change.csv") + +# vim: set ts=2 sw=2 et tw=120: diff --git a/machine_time_waste/machine_time_waste_rdd.py b/machine_time_waste/machine_time_waste_rdd.py new file mode 100755 index 00000000..33f66829 --- /dev/null +++ b/machine_time_waste/machine_time_waste_rdd.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# # Temporal impact: machine time waste + +import json +import pandas +from IPython import display +import findspark +findspark.init() +import pyspark +import pyspark.sql +import sys + +from pyspark.sql.functions import col, lag, when, concat_ws, last, first +from pyspark.sql import Window +from pyspark.sql.types import LongType + +if len(sys.argv) != 2 or len(sys.argv[1]) != 1: + print("usage: " + sys.argv[0] + " {cluster}", file=sys.stderr) + sys.exit(1) + +cluster=sys.argv[1] + +spark = pyspark.sql.SparkSession.builder \ + .appName("machine_time_waste") \ + .config("spark.local.dir", "/tmp/ramdisk/spark") \ + .config("spark.driver.memory", "124g") \ + .getOrCreate() + +df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_instance_events*.json.gz") +# df = spark.read.json("/home/claudio/google_2019/instance_events/" + cluster + "/" + cluster + "_test.json") + +df.printSchema() + +df.show() + +# .filter(df.collection_type == 0) \ +df2 = df \ + .withColumn("time", col("time").cast(LongType())) \ + .withColumn("type", col("type").cast(LongType())) \ + .withColumn("type", when(col("type").isNull(), 0).otherwise(col("type"))) \ + .withColumn("id", concat_ws("-", "collection_id", "instance_index")) \ + .where(col("time").isNotNull()) \ + .where(col("type").isNotNull()) \ + .where((col("instance_index").isNotNull()) & (col("collection_id").isNotNull())) \ + .select("time", "type", "id") + +df2.show() + +total = df.count() +filtered = df2.count() + +print("Total: " + str(total)) +print("Filtered: " + str(filtered)) + +r = df2.rdd + +def for_each_task(ts): + ts = sorted(ts, key=lambda x: x.time) + last_term = None + prev = None + tr = {} + + for i,t in enumerate(ts): + if prev is not None and t.type == prev.type: # remove useless transitions + if (i == len(ts)-1): # if last + tr[str(prev.type) + "-" + str(t.type)] = t.time - prev.time # keep "loops" if last + else: + continue + if t.type >= 4 and t.type <= 8: + last_term = t.type + if prev is not None: + tr[str(prev.type) + "-" + str(t.type)] = t.time - prev.time + prev = t + return {"last_term": last_term, 'tr': tr} + +def sum_values(ds): + dsum = {} + for dt in ds: + d = dt["tr"] + for key in d: + if key not in dsum: + dsum[key] = d[key] + else: + dsum[key] += d[key] + return dsum + +r2 = r \ + .groupBy(lambda x: x.id) \ + .mapValues(for_each_task) \ + .map(lambda x: x[1]) \ + .groupBy(lambda x: x["last_term"]) \ + .mapValues(sum_values) \ + .collect() + +with open(cluster + "_state_changes.json", "w") as out: + json.dump({"filtered": filtered, "total": total, "data": r2}, out) + +# .withColumn("prev_time", lag(df2.time).over(my_window)) \ +# 
.withColumn("prev_type", lag(df2.type).over(my_window)) \ + +# vim: set ts=2 sw=2 et tw=120: