diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..e532dd83
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+**/.ipynb_checkpoints/
diff --git a/machine_configs/.ipynb_checkpoints/machine_configs-checkpoint.ipynb b/machine_configs/.ipynb_checkpoints/machine_configs-checkpoint.ipynb
deleted file mode 100644
index bf43e41b..00000000
--- a/machine_configs/.ipynb_checkpoints/machine_configs-checkpoint.ipynb
+++ /dev/null
@@ -1,1592 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "built-symbol",
- "metadata": {},
- "source": [
- "# Machine configurations\n",
- "\n",
- "This query returns all the distinct NCU/NMU configurations in the borg clusters, including how many machines ids match for any specific configuration.\n",
- "\n",
- "Please note that for simplicity's sake the we are technically counting the number of ADD or UPDATE events for each configuration, and not the actual count of machines. Therefore a machine configuration may change over time and count twice or more."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "stuffed-lightning",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "# For pretty printing\n",
- "from IPython.display import display\n",
- "\n",
- "# Disables row ellipsis\n",
- "pd.set_option('display.max_rows', 200)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "upper-lloyd",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load all machine event rows in a single DataFrame, and add a \"cluster\" column to differentiate\n",
- "# between clusters\n",
- "df = None\n",
- "for l in \"abcdefgh\":\n",
- " dfl = pd.read_csv(\"~/google_2019/machine_events/\" + l + \"_machine_events.csv\")\n",
- " dfl[\"cluster\"] = l\n",
- " if df is None:\n",
- " df = dfl\n",
- " else:\n",
- " df = pd.concat([df, dfl], axis=0)\n",
- "\n",
- "# Filter only ADD or UPDATE events\n",
- "df = df[(df.type==1)|(df.type==3)]\n",
- "\n",
- "# P.S: ADD=1, REMOVE=2, UPDATE=3\n",
- " \n",
- "df = df[[\"capacity.cpus\", \"capacity.memory\", \"cluster\", \n",
- " \"missing_data_reason\", \"machine_id\"]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "presidential-farmer",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " cluster | \n",
- " machine_id | \n",
- "
\n",
- " \n",
- " missing_data_reason | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " NaN | \n",
- " 523781 | \n",
- " 523781 | \n",
- " 532510 | \n",
- " 532510 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " capacity.cpus capacity.memory cluster machine_id\n",
- "missing_data_reason \n",
- "NaN 523781 523781 532510 532510"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Checking if we need to deal with particular missing data\n",
- "# No columns returned, so missing data can be safely ignored\n",
- "df.groupby(by=[\"missing_data_reason\"], dropna=False).count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "informative-vietnam",
- "metadata": {},
- "outputs": [],
- "source": [
- "def do_group_by(df):\n",
- " # Exclude \"cluster\" column and perform group-by\n",
- " dfg = df[df.columns.difference(['cluster'])]. \\\n",
- " groupby(by=[\"capacity.cpus\",\"capacity.memory\"], \n",
- " dropna=False).count()\n",
- " \n",
- " # Compute relative number of machines\n",
- " total_machines = dfg['machine_id'].sum()\n",
- " dfg[\"machine_id_perc\"] = dfg[\"machine_id\"] * 100 / total_machines\n",
- " \n",
- " # Sort descending\n",
- " dfg = dfg.sort_values(\"machine_id_perc\", ascending=False)\n",
- " \n",
- " display(dfg)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "pretty-taiwan",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster a:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 29487 | \n",
- " 34.758469 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 13440 | \n",
- " 15.842705 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 12495 | \n",
- " 14.728764 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 9057 | \n",
- " 10.676144 | \n",
- "
\n",
- " \n",
- " 0.166748 | \n",
- " 5265 | \n",
- " 6.206238 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 4608 | \n",
- " 5.431784 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 4446 | \n",
- " 5.240823 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.166748 | \n",
- " 2484 | \n",
- " 2.928071 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 1377 | \n",
- " 1.623170 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 1143 | \n",
- " 1.347337 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 654 | \n",
- " 0.770917 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.250000 | \n",
- " 366 | \n",
- " 0.431431 | \n",
- "
\n",
- " \n",
- " 0.479492 | \n",
- " 0.250000 | \n",
- " 6 | \n",
- " 0.007073 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.250000 | \n",
- " 6 | \n",
- " 0.007073 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "0.591797 0.333496 29487 34.758469\n",
- "1.000000 0.500000 13440 15.842705\n",
- "0.708984 0.333496 12495 14.728764\n",
- "0.386719 0.333496 9057 10.676144\n",
- " 0.166748 5265 6.206238\n",
- "0.708984 0.666992 4608 5.431784\n",
- "1.000000 1.000000 4446 5.240823\n",
- "0.591797 0.166748 2484 2.928071\n",
- "NaN NaN 1377 1.623170\n",
- "0.958984 0.500000 1143 1.347337\n",
- " 1.000000 654 0.770917\n",
- "1.000000 0.250000 366 0.431431\n",
- "0.479492 0.250000 6 0.007073\n",
- "0.708984 0.250000 6 0.007073"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster b:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 16184 | \n",
- " 31.982926 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 9790 | \n",
- " 19.347061 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 8448 | \n",
- " 16.694992 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 5502 | \n",
- " 10.873088 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 3832 | \n",
- " 7.572823 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2214 | \n",
- " 4.375321 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.166748 | \n",
- " 2152 | \n",
- " 4.252796 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 816 | \n",
- " 1.612584 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 1.000000 | \n",
- " 618 | \n",
- " 1.221296 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.666992 | \n",
- " 500 | \n",
- " 0.988103 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 412 | \n",
- " 0.814197 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 134 | \n",
- " 0.264812 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "0.591797 0.333496 16184 31.982926\n",
- "1.000000 0.500000 9790 19.347061\n",
- "0.708984 0.333496 8448 16.694992\n",
- "0.958984 0.500000 5502 10.873088\n",
- "0.708984 0.666992 3832 7.572823\n",
- "1.000000 1.000000 2214 4.375321\n",
- "0.591797 0.166748 2152 4.252796\n",
- "0.386719 0.333496 816 1.612584\n",
- "0.958984 1.000000 618 1.221296\n",
- "0.591797 0.666992 500 0.988103\n",
- "0.386719 0.166748 412 0.814197\n",
- "NaN NaN 134 0.264812"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster c:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0.259277 | \n",
- " 0.166748 | \n",
- " 15754 | \n",
- " 24.439204 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 11104 | \n",
- " 17.225652 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 10404 | \n",
- " 16.139741 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 6634 | \n",
- " 10.291334 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 5654 | \n",
- " 8.771059 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 3580 | \n",
- " 5.553660 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 2900 | \n",
- " 4.498774 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2736 | \n",
- " 4.244361 | \n",
- "
\n",
- " \n",
- " 0.250000 | \n",
- " 2132 | \n",
- " 3.307375 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 1466 | \n",
- " 2.274208 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 1.000000 | \n",
- " 766 | \n",
- " 1.188297 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 620 | \n",
- " 0.961807 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.250000 | \n",
- " 600 | \n",
- " 0.930781 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.166748 | \n",
- " 112 | \n",
- " 0.173746 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "0.259277 0.166748 15754 24.439204\n",
- "0.386719 0.333496 11104 17.225652\n",
- "0.591797 0.333496 10404 16.139741\n",
- "0.958984 0.500000 6634 10.291334\n",
- "1.000000 0.500000 5654 8.771059\n",
- "0.386719 0.166748 3580 5.553660\n",
- "0.708984 0.666992 2900 4.498774\n",
- "1.000000 1.000000 2736 4.244361\n",
- " 0.250000 2132 3.307375\n",
- "NaN NaN 1466 2.274208\n",
- "0.958984 1.000000 766 1.188297\n",
- "0.708984 0.333496 620 0.961807\n",
- "0.958984 0.250000 600 0.930781\n",
- "0.591797 0.166748 112 0.173746"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster d:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 28394 | \n",
- " 45.288376 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 8402 | \n",
- " 13.401174 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.166748 | \n",
- " 8020 | \n",
- " 12.791885 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 5806 | \n",
- " 9.260559 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 4380 | \n",
- " 6.986092 | \n",
- "
\n",
- " \n",
- " 0.333496 | \n",
- " 3924 | \n",
- " 6.258772 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.166748 | \n",
- " 2548 | \n",
- " 4.064055 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 498 | \n",
- " 0.794309 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.333496 | \n",
- " 426 | \n",
- " 0.679469 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 292 | \n",
- " 0.465739 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.250000 | \n",
- " 4 | \n",
- " 0.006380 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.500000 | \n",
- " 2 | \n",
- " 0.003190 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "0.591797 0.333496 28394 45.288376\n",
- "0.386719 0.333496 8402 13.401174\n",
- "0.259277 0.166748 8020 12.791885\n",
- "0.386719 0.166748 5806 9.260559\n",
- "0.708984 0.666992 4380 6.986092\n",
- " 0.333496 3924 6.258772\n",
- "0.591797 0.166748 2548 4.064055\n",
- "NaN NaN 498 0.794309\n",
- "0.259277 0.333496 426 0.679469\n",
- "1.000000 0.500000 292 0.465739\n",
- "0.591797 0.250000 4 0.006380\n",
- "0.708984 0.500000 2 0.003190"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster e:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0.259277 | \n",
- " 0.166748 | \n",
- " 38452 | \n",
- " 48.202377 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 11786 | \n",
- " 14.774608 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 8646 | \n",
- " 10.838389 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 7606 | \n",
- " 9.534674 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 5586 | \n",
- " 7.002457 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 4470 | \n",
- " 5.603470 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.333496 | \n",
- " 1268 | \n",
- " 1.589530 | \n",
- "
\n",
- " \n",
- " 0.083374 | \n",
- " 634 | \n",
- " 0.794765 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 536 | \n",
- " 0.671915 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 324 | \n",
- " 0.406158 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.250000 | \n",
- " 268 | \n",
- " 0.335957 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 138 | \n",
- " 0.172993 | \n",
- "
\n",
- " \n",
- " 0.500000 | \n",
- " 0.062500 | \n",
- " 54 | \n",
- " 0.067693 | \n",
- "
\n",
- " \n",
- " 0.250000 | \n",
- " 4 | \n",
- " 0.005014 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "0.259277 0.166748 38452 48.202377\n",
- "0.708984 0.333496 11786 14.774608\n",
- "0.958984 0.500000 8646 10.838389\n",
- "0.708984 0.666992 7606 9.534674\n",
- "1.000000 0.500000 5586 7.002457\n",
- "0.386719 0.166748 4470 5.603470\n",
- "0.259277 0.333496 1268 1.589530\n",
- " 0.083374 634 0.794765\n",
- "NaN NaN 536 0.671915\n",
- "0.591797 0.333496 324 0.406158\n",
- "1.000000 0.250000 268 0.335957\n",
- " 1.000000 138 0.172993\n",
- "0.500000 0.062500 54 0.067693\n",
- " 0.250000 4 0.005014"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster f:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 41340 | \n",
- " 66.396839 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 6878 | \n",
- " 11.046866 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 5564 | \n",
- " 8.936430 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 2172 | \n",
- " 3.488484 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 1544 | \n",
- " 2.479843 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 1432 | \n",
- " 2.299958 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 1244 | \n",
- " 1.998008 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.250000 | \n",
- " 792 | \n",
- " 1.272044 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 1.000000 | \n",
- " 536 | \n",
- " 0.860878 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 398 | \n",
- " 0.639234 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 344 | \n",
- " 0.552504 | \n",
- "
\n",
- " \n",
- " 0.500000 | \n",
- " 0.250000 | \n",
- " 18 | \n",
- " 0.028910 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "1.000000 0.500000 41340 66.396839\n",
- "0.708984 0.333496 6878 11.046866\n",
- "0.591797 0.333496 5564 8.936430\n",
- "0.958984 0.500000 2172 3.488484\n",
- "0.386719 0.166748 1544 2.479843\n",
- "NaN NaN 1432 2.299958\n",
- "0.708984 0.666992 1244 1.998008\n",
- "1.000000 0.250000 792 1.272044\n",
- "0.958984 1.000000 536 0.860878\n",
- "0.386719 0.333496 398 0.639234\n",
- "1.000000 1.000000 344 0.552504\n",
- "0.500000 0.250000 18 0.028910"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster g:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0.259277 | \n",
- " 0.166748 | \n",
- " 15852 | \n",
- " 22.892958 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 11808 | \n",
- " 17.052741 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 7968 | \n",
- " 11.507134 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 7830 | \n",
- " 11.307839 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 4690 | \n",
- " 6.773150 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 4258 | \n",
- " 6.149269 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 4196 | \n",
- " 6.059731 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 3864 | \n",
- " 5.580267 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.166748 | \n",
- " 2606 | \n",
- " 3.763503 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.250000 | \n",
- " 2100 | \n",
- " 3.032754 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 1566 | \n",
- " 2.261568 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.333496 | \n",
- " 1330 | \n",
- " 1.920744 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 1.000000 | \n",
- " 778 | \n",
- " 1.123563 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 378 | \n",
- " 0.545896 | \n",
- "
\n",
- " \n",
- " 0.500000 | \n",
- " 0.250000 | \n",
- " 12 | \n",
- " 0.017330 | \n",
- "
\n",
- " \n",
- " 0.479492 | \n",
- " 0.250000 | \n",
- " 6 | \n",
- " 0.008665 | \n",
- "
\n",
- " \n",
- " 0.500000 | \n",
- " 2 | \n",
- " 0.002888 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "0.259277 0.166748 15852 22.892958\n",
- "1.000000 0.500000 11808 17.052741\n",
- "0.708984 0.333496 7968 11.507134\n",
- "0.591797 0.333496 7830 11.307839\n",
- "0.386719 0.166748 4690 6.773150\n",
- "0.708984 0.666992 4258 6.149269\n",
- "0.958984 0.500000 4196 6.059731\n",
- "0.386719 0.333496 3864 5.580267\n",
- "0.591797 0.166748 2606 3.763503\n",
- "1.000000 0.250000 2100 3.032754\n",
- "NaN NaN 1566 2.261568\n",
- "0.259277 0.333496 1330 1.920744\n",
- "0.958984 1.000000 778 1.123563\n",
- "1.000000 1.000000 378 0.545896\n",
- "0.500000 0.250000 12 0.017330\n",
- "0.479492 0.250000 6 0.008665\n",
- " 0.500000 2 0.002888"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "For cluster h:\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 36324 | \n",
- " 61.946178 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 4826 | \n",
- " 8.230158 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 3682 | \n",
- " 6.279205 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 2858 | \n",
- " 4.873973 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 2596 | \n",
- " 4.427163 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2030 | \n",
- " 3.461919 | \n",
- "
\n",
- " \n",
- " 0.250000 | \n",
- " 1892 | \n",
- " 3.226577 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 1720 | \n",
- " 2.933251 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 1244 | \n",
- " 2.121491 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 766 | \n",
- " 1.306320 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.666992 | \n",
- " 500 | \n",
- " 0.852689 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 1.000000 | \n",
- " 200 | \n",
- " 0.341076 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "1.000000 0.500000 36324 61.946178\n",
- "0.591797 0.333496 4826 8.230158\n",
- "0.708984 0.333496 3682 6.279205\n",
- "0.958984 0.500000 2858 4.873973\n",
- "0.386719 0.333496 2596 4.427163\n",
- "1.000000 1.000000 2030 3.461919\n",
- " 0.250000 1892 3.226577\n",
- "NaN NaN 1720 2.933251\n",
- "0.386719 0.166748 1244 2.121491\n",
- "0.708984 0.666992 766 1.306320\n",
- "0.591797 0.666992 500 0.852689\n",
- "0.958984 1.000000 200 0.341076"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " For all clusters:\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " machine_id | \n",
- " machine_id_perc | \n",
- "
\n",
- " \n",
- " capacity.cpus | \n",
- " capacity.memory | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1.000000 | \n",
- " 0.500000 | \n",
- " 124234 | \n",
- " 23.329891 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.333496 | \n",
- " 103013 | \n",
- " 19.344801 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.166748 | \n",
- " 78078 | \n",
- " 14.662260 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.333496 | \n",
- " 55801 | \n",
- " 10.478864 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.333496 | \n",
- " 36237 | \n",
- " 6.804943 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.500000 | \n",
- " 31151 | \n",
- " 5.849843 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.666992 | \n",
- " 29594 | \n",
- " 5.557454 | \n",
- "
\n",
- " \n",
- " 0.386719 | \n",
- " 0.166748 | \n",
- " 27011 | \n",
- " 5.072393 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 12286 | \n",
- " 2.307187 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.166748 | \n",
- " 9902 | \n",
- " 1.859496 | \n",
- "
\n",
- " \n",
- " NaN | \n",
- " NaN | \n",
- " 8729 | \n",
- " 1.639218 | \n",
- "
\n",
- " \n",
- " 1.000000 | \n",
- " 0.250000 | \n",
- " 7550 | \n",
- " 1.417814 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 1.000000 | \n",
- " 3552 | \n",
- " 0.667030 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.333496 | \n",
- " 3024 | \n",
- " 0.567877 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.666992 | \n",
- " 1000 | \n",
- " 0.187790 | \n",
- "
\n",
- " \n",
- " 0.259277 | \n",
- " 0.083374 | \n",
- " 634 | \n",
- " 0.119059 | \n",
- "
\n",
- " \n",
- " 0.958984 | \n",
- " 0.250000 | \n",
- " 600 | \n",
- " 0.112674 | \n",
- "
\n",
- " \n",
- " 0.500000 | \n",
- " 0.062500 | \n",
- " 54 | \n",
- " 0.010141 | \n",
- "
\n",
- " \n",
- " 0.250000 | \n",
- " 34 | \n",
- " 0.006385 | \n",
- "
\n",
- " \n",
- " 0.479492 | \n",
- " 0.250000 | \n",
- " 12 | \n",
- " 0.002253 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.250000 | \n",
- " 6 | \n",
- " 0.001127 | \n",
- "
\n",
- " \n",
- " 0.591797 | \n",
- " 0.250000 | \n",
- " 4 | \n",
- " 0.000751 | \n",
- "
\n",
- " \n",
- " 0.708984 | \n",
- " 0.500000 | \n",
- " 2 | \n",
- " 0.000376 | \n",
- "
\n",
- " \n",
- " 0.479492 | \n",
- " 0.500000 | \n",
- " 2 | \n",
- " 0.000376 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " machine_id machine_id_perc\n",
- "capacity.cpus capacity.memory \n",
- "1.000000 0.500000 124234 23.329891\n",
- "0.591797 0.333496 103013 19.344801\n",
- "0.259277 0.166748 78078 14.662260\n",
- "0.708984 0.333496 55801 10.478864\n",
- "0.386719 0.333496 36237 6.804943\n",
- "0.958984 0.500000 31151 5.849843\n",
- "0.708984 0.666992 29594 5.557454\n",
- "0.386719 0.166748 27011 5.072393\n",
- "1.000000 1.000000 12286 2.307187\n",
- "0.591797 0.166748 9902 1.859496\n",
- "NaN NaN 8729 1.639218\n",
- "1.000000 0.250000 7550 1.417814\n",
- "0.958984 1.000000 3552 0.667030\n",
- "0.259277 0.333496 3024 0.567877\n",
- "0.591797 0.666992 1000 0.187790\n",
- "0.259277 0.083374 634 0.119059\n",
- "0.958984 0.250000 600 0.112674\n",
- "0.500000 0.062500 54 0.010141\n",
- " 0.250000 34 0.006385\n",
- "0.479492 0.250000 12 0.002253\n",
- "0.708984 0.250000 6 0.001127\n",
- "0.591797 0.250000 4 0.000751\n",
- "0.708984 0.500000 2 0.000376\n",
- "0.479492 0.500000 2 0.000376"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Generate machine configurations table per cluster + a global table\n",
- "\n",
- "df = df[df.columns.difference(['missing_data_reason'])]\n",
- "\n",
- "for l in \"abcdefgh\":\n",
- " print(\"\\nFor cluster \" + l + \":\\n\")\n",
- " do_group_by(df[df.cluster==l])\n",
- "\n",
- "print(\"\\n For all clusters:\")\n",
- "do_group_by(df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "supreme-hepatitis",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb b/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb
deleted file mode 100644
index 40fa4661..00000000
--- a/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb
+++ /dev/null
@@ -1,895 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "proper-gnome",
- "metadata": {},
- "source": [
- "# Temporal impact: machine time waste"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "fantastic-harrison",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "import pandas\n",
- "from IPython import display\n",
- "import findspark\n",
- "findspark.init()\n",
- "import pyspark\n",
- "import pyspark.sql\n",
- "import sys\n",
- "\n",
- "from pyspark.sql.functions import col, lag, when, concat_ws, last, first\n",
- "from pyspark.sql import Window\n",
- "from pyspark.sql.types import LongType"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "failing-rebecca",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "cluster=\"b\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "transsexual-baptist",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "ERROR:root:Exception while sending command.\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1207, in send_command\n",
- " raise Py4JNetworkError(\"Answer from Java side is empty\")\n",
- "py4j.protocol.Py4JNetworkError: Answer from Java side is empty\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1033, in send_command\n",
- " response = connection.send_command(command)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1212, in send_command\n",
- " \"Error while receiving\", e, proto.ERROR_ON_RECEIVE)\n",
- "py4j.protocol.Py4JNetworkError: Error while receiving\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n",
- "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n",
- " File \"\", line 5, in \n",
- " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n",
- " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n",
- " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n",
- " answer, self.gateway_client, self.target_id, self.name)\n",
- " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n",
- " return f(*a, **kw)\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n",
- " format(target_id, \".\", name))\n",
- "py4j.protocol.Py4JError: An error occurred while calling o26.json\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n",
- " stb = value._render_traceback_()\n",
- "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n",
- " connection = self.deque.pop()\n",
- "IndexError: pop from an empty deque\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n",
- " self.socket.connect((self.address, self.port))\n",
- "ConnectionRefusedError: [Errno 111] Connection refused\n"
- ]
- },
- {
- "ename": "Py4JError",
- "evalue": "An error occurred while calling o26.json",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mPy4JError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mgetOrCreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/claudio/google_2019/instance_events/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"_instance_events*.json.gz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m/opt/spark/python/pyspark/sql/readwriter.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding, locale, pathGlobFilter, recursiveFileLookup)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 300\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jreader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_spark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPythonUtils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoSeq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 301\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRDD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1303\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1304\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1305\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1306\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1307\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/opt/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mconverted\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 334\u001b[0m raise Py4JError(\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m format(target_id, \".\", name))\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mPy4JError\u001b[0m: An error occurred while calling o26.json"
- ]
- }
- ],
- "source": [
- "spark = pyspark.sql.SparkSession.builder \\\n",
- " .appName(\"machine_time_waste\") \\\n",
- " .getOrCreate()\n",
- "\n",
- "df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "juvenile-absolute",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "df.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "lucky-western",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "df.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "normal-settlement",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "# .filter(df.collection_type == 0) \\\n",
- "df2 = df \\\n",
- " .withColumn(\"time\", col(\"time\").cast(LongType())) \\\n",
- " .withColumn(\"type\", col(\"type\").cast(LongType())) \\\n",
- " .withColumn(\"type\", when(col(\"type\").isNull(), 0).otherwise(col(\"type\"))) \\\n",
- " .withColumn(\"id\", concat_ws(\"-\", \"collection_id\", \"instance_index\")) \\\n",
- " .where(col(\"time\").isNotNull()) \\\n",
- " .where(col(\"type\").isNotNull()) \\\n",
- " .where((col(\"instance_index\").isNotNull()) & (col(\"collection_id\").isNotNull())) \\\n",
- " .select(\"machine_id\", \"id\", \"time\", \"type\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "typical-homeless",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "df2.show()\n",
- "print(\"Total: \" + str(df.count()))\n",
- "print(\"Filtered: \" + str(df2.count()))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "collect-saying",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "# my_window = Window.partitionBy(\"machine_id\", \"id\").orderBy(df2.time.asc())\n",
- "\n",
- "w2 = Window.partitionBy(\"id\").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cooperative-appraisal",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "# .withColumn(\"prev_time\", lag(df2.time).over(my_window)) \\\n",
- "# .withColumn(\"prev_type\", lag(df2.type).over(my_window)) \\\n",
- "\n",
- "df3 = df2 \\\n",
- " .withColumn(\"t3_time\", when((df2.type != 3), None).otherwise(df2.time)) \\\n",
- " .withColumn(\"t45678_time\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \\\n",
- " .withColumn(\"t45678_type\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \\\n",
- " .withColumn(\"t01_time\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \\\n",
- " .withColumn(\"t01_type\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \\\n",
- " .withColumn(\"next_time\", when(df2.type == 3, first(col(\"t45678_time\"), ignorenulls=True).over(w2)) \\\n",
- " .when((df2.type == 0) | (df2.type == 1), first(col(\"t3_time\"), ignorenulls=True).over(w2)) \\\n",
- " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_time\"), ignorenulls=True).over(w2)) \\\n",
- " .otherwise(None)) \\\n",
- " .withColumn(\"next_type\", when(df2.type == 3, first(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n",
- " .when((df2.type == 0) | (df2.type == 1), 3) \\\n",
- " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_type\"), ignorenulls=True).over(w2)) \\\n",
- " .otherwise(None)) \\\n",
- " .withColumn(\"last_term_type\", last(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n",
- " .withColumn(\"time_delta\", col(\"next_time\") - col(\"time\")) \\\n",
- " .select(\"machine_id\", \"id\", \"time\", \"type\", \"last_term_type\", \"time_delta\", \"t01_time\", \"t01_type\", \"t3_time\", \"t45678_time\", \"t45678_type\", \"next_time\", \"next_type\") \\"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ideal-angle",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "df4 = df3.where(df3.next_type.isNotNull()).groupby(\"type\", \"next_type\", \"last_term_type\").sum(\"time_delta\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "working-difficulty",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- }
- },
- "outputs": [],
- "source": [
- "# df3.orderBy(df3.machine_id, df3.time).show(n=100)\n",
- "# df3.printSchema()\n",
- "df4.show(n=1000000)\n",
- "df4.write.csv(\"/home/claudio/google_2019/thesis_queries/machine_time_waste/\" + cluster + \"_state_change.csv\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}