diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..e532dd83 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +**/.ipynb_checkpoints/ diff --git a/machine_configs/.ipynb_checkpoints/machine_configs-checkpoint.ipynb b/machine_configs/.ipynb_checkpoints/machine_configs-checkpoint.ipynb deleted file mode 100644 index bf43e41b..00000000 --- a/machine_configs/.ipynb_checkpoints/machine_configs-checkpoint.ipynb +++ /dev/null @@ -1,1592 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "built-symbol", - "metadata": {}, - "source": [ - "# Machine configurations\n", - "\n", - "This query returns all the distinct NCU/NMU configurations in the borg clusters, including how many machines ids match for any specific configuration.\n", - "\n", - "Please note that for simplicity's sake the we are technically counting the number of ADD or UPDATE events for each configuration, and not the actual count of machines. Therefore a machine configuration may change over time and count twice or more." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "stuffed-lightning", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "# For pretty printing\n", - "from IPython.display import display\n", - "\n", - "# Disables row ellipsis\n", - "pd.set_option('display.max_rows', 200)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "upper-lloyd", - "metadata": {}, - "outputs": [], - "source": [ - "# Load all machine event rows in a single DataFrame, and add a \"cluster\" column to differentiate\n", - "# between clusters\n", - "df = None\n", - "for l in \"abcdefgh\":\n", - " dfl = pd.read_csv(\"~/google_2019/machine_events/\" + l + \"_machine_events.csv\")\n", - " dfl[\"cluster\"] = l\n", - " if df is None:\n", - " df = dfl\n", - " else:\n", - " df = pd.concat([df, dfl], axis=0)\n", - "\n", - "# Filter only ADD or UPDATE events\n", - "df = df[(df.type==1)|(df.type==3)]\n", - "\n", - "# P.S: ADD=1, REMOVE=2, UPDATE=3\n", - " \n", - "df = df[[\"capacity.cpus\", \"capacity.memory\", \"cluster\", \n", - " \"missing_data_reason\", \"machine_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "presidential-farmer", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
capacity.cpuscapacity.memoryclustermachine_id
missing_data_reason
NaN523781523781532510532510
\n", - "
" - ], - "text/plain": [ - " capacity.cpus capacity.memory cluster machine_id\n", - "missing_data_reason \n", - "NaN 523781 523781 532510 532510" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Checking if we need to deal with particular missing data\n", - "# No columns returned, so missing data can be safely ignored\n", - "df.groupby(by=[\"missing_data_reason\"], dropna=False).count()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "informative-vietnam", - "metadata": {}, - "outputs": [], - "source": [ - "def do_group_by(df):\n", - " # Exclude \"cluster\" column and perform group-by\n", - " dfg = df[df.columns.difference(['cluster'])]. \\\n", - " groupby(by=[\"capacity.cpus\",\"capacity.memory\"], \n", - " dropna=False).count()\n", - " \n", - " # Compute relative number of machines\n", - " total_machines = dfg['machine_id'].sum()\n", - " dfg[\"machine_id_perc\"] = dfg[\"machine_id\"] * 100 / total_machines\n", - " \n", - " # Sort descending\n", - " dfg = dfg.sort_values(\"machine_id_perc\", ascending=False)\n", - " \n", - " display(dfg)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "pretty-taiwan", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster a:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
0.5917970.3334962948734.758469
1.0000000.5000001344015.842705
0.7089840.3334961249514.728764
0.3867190.333496905710.676144
0.16674852656.206238
0.7089840.66699246085.431784
1.0000001.00000044465.240823
0.5917970.16674824842.928071
NaNNaN13771.623170
0.9589840.50000011431.347337
1.0000006540.770917
1.0000000.2500003660.431431
0.4794920.25000060.007073
0.7089840.25000060.007073
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "0.591797 0.333496 29487 34.758469\n", - "1.000000 0.500000 13440 15.842705\n", - "0.708984 0.333496 12495 14.728764\n", - "0.386719 0.333496 9057 10.676144\n", - " 0.166748 5265 6.206238\n", - "0.708984 0.666992 4608 5.431784\n", - "1.000000 1.000000 4446 5.240823\n", - "0.591797 0.166748 2484 2.928071\n", - "NaN NaN 1377 1.623170\n", - "0.958984 0.500000 1143 1.347337\n", - " 1.000000 654 0.770917\n", - "1.000000 0.250000 366 0.431431\n", - "0.479492 0.250000 6 0.007073\n", - "0.708984 0.250000 6 0.007073" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster b:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
0.5917970.3334961618431.982926
1.0000000.500000979019.347061
0.7089840.333496844816.694992
0.9589840.500000550210.873088
0.7089840.66699238327.572823
1.0000001.00000022144.375321
0.5917970.16674821524.252796
0.3867190.3334968161.612584
0.9589841.0000006181.221296
0.5917970.6669925000.988103
0.3867190.1667484120.814197
NaNNaN1340.264812
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "0.591797 0.333496 16184 31.982926\n", - "1.000000 0.500000 9790 19.347061\n", - "0.708984 0.333496 8448 16.694992\n", - "0.958984 0.500000 5502 10.873088\n", - "0.708984 0.666992 3832 7.572823\n", - "1.000000 1.000000 2214 4.375321\n", - "0.591797 0.166748 2152 4.252796\n", - "0.386719 0.333496 816 1.612584\n", - "0.958984 1.000000 618 1.221296\n", - "0.591797 0.666992 500 0.988103\n", - "0.386719 0.166748 412 0.814197\n", - "NaN NaN 134 0.264812" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster c:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
0.2592770.1667481575424.439204
0.3867190.3334961110417.225652
0.5917970.3334961040416.139741
0.9589840.500000663410.291334
1.0000000.50000056548.771059
0.3867190.16674835805.553660
0.7089840.66699229004.498774
1.0000001.00000027364.244361
0.25000021323.307375
NaNNaN14662.274208
0.9589841.0000007661.188297
0.7089840.3334966200.961807
0.9589840.2500006000.930781
0.5917970.1667481120.173746
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "0.259277 0.166748 15754 24.439204\n", - "0.386719 0.333496 11104 17.225652\n", - "0.591797 0.333496 10404 16.139741\n", - "0.958984 0.500000 6634 10.291334\n", - "1.000000 0.500000 5654 8.771059\n", - "0.386719 0.166748 3580 5.553660\n", - "0.708984 0.666992 2900 4.498774\n", - "1.000000 1.000000 2736 4.244361\n", - " 0.250000 2132 3.307375\n", - "NaN NaN 1466 2.274208\n", - "0.958984 1.000000 766 1.188297\n", - "0.708984 0.333496 620 0.961807\n", - "0.958984 0.250000 600 0.930781\n", - "0.591797 0.166748 112 0.173746" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster d:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
0.5917970.3334962839445.288376
0.3867190.333496840213.401174
0.2592770.166748802012.791885
0.3867190.16674858069.260559
0.7089840.66699243806.986092
0.33349639246.258772
0.5917970.16674825484.064055
NaNNaN4980.794309
0.2592770.3334964260.679469
1.0000000.5000002920.465739
0.5917970.25000040.006380
0.7089840.50000020.003190
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "0.591797 0.333496 28394 45.288376\n", - "0.386719 0.333496 8402 13.401174\n", - "0.259277 0.166748 8020 12.791885\n", - "0.386719 0.166748 5806 9.260559\n", - "0.708984 0.666992 4380 6.986092\n", - " 0.333496 3924 6.258772\n", - "0.591797 0.166748 2548 4.064055\n", - "NaN NaN 498 0.794309\n", - "0.259277 0.333496 426 0.679469\n", - "1.000000 0.500000 292 0.465739\n", - "0.591797 0.250000 4 0.006380\n", - "0.708984 0.500000 2 0.003190" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster e:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
0.2592770.1667483845248.202377
0.7089840.3334961178614.774608
0.9589840.500000864610.838389
0.7089840.66699276069.534674
1.0000000.50000055867.002457
0.3867190.16674844705.603470
0.2592770.33349612681.589530
0.0833746340.794765
NaNNaN5360.671915
0.5917970.3334963240.406158
1.0000000.2500002680.335957
1.0000001380.172993
0.5000000.062500540.067693
0.25000040.005014
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "0.259277 0.166748 38452 48.202377\n", - "0.708984 0.333496 11786 14.774608\n", - "0.958984 0.500000 8646 10.838389\n", - "0.708984 0.666992 7606 9.534674\n", - "1.000000 0.500000 5586 7.002457\n", - "0.386719 0.166748 4470 5.603470\n", - "0.259277 0.333496 1268 1.589530\n", - " 0.083374 634 0.794765\n", - "NaN NaN 536 0.671915\n", - "0.591797 0.333496 324 0.406158\n", - "1.000000 0.250000 268 0.335957\n", - " 1.000000 138 0.172993\n", - "0.500000 0.062500 54 0.067693\n", - " 0.250000 4 0.005014" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster f:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
1.0000000.5000004134066.396839
0.7089840.333496687811.046866
0.5917970.33349655648.936430
0.9589840.50000021723.488484
0.3867190.16674815442.479843
NaNNaN14322.299958
0.7089840.66699212441.998008
1.0000000.2500007921.272044
0.9589841.0000005360.860878
0.3867190.3334963980.639234
1.0000001.0000003440.552504
0.5000000.250000180.028910
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "1.000000 0.500000 41340 66.396839\n", - "0.708984 0.333496 6878 11.046866\n", - "0.591797 0.333496 5564 8.936430\n", - "0.958984 0.500000 2172 3.488484\n", - "0.386719 0.166748 1544 2.479843\n", - "NaN NaN 1432 2.299958\n", - "0.708984 0.666992 1244 1.998008\n", - "1.000000 0.250000 792 1.272044\n", - "0.958984 1.000000 536 0.860878\n", - "0.386719 0.333496 398 0.639234\n", - "1.000000 1.000000 344 0.552504\n", - "0.500000 0.250000 18 0.028910" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster g:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
0.2592770.1667481585222.892958
1.0000000.5000001180817.052741
0.7089840.333496796811.507134
0.5917970.333496783011.307839
0.3867190.16674846906.773150
0.7089840.66699242586.149269
0.9589840.50000041966.059731
0.3867190.33349638645.580267
0.5917970.16674826063.763503
1.0000000.25000021003.032754
NaNNaN15662.261568
0.2592770.33349613301.920744
0.9589841.0000007781.123563
1.0000001.0000003780.545896
0.5000000.250000120.017330
0.4794920.25000060.008665
0.50000020.002888
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "0.259277 0.166748 15852 22.892958\n", - "1.000000 0.500000 11808 17.052741\n", - "0.708984 0.333496 7968 11.507134\n", - "0.591797 0.333496 7830 11.307839\n", - "0.386719 0.166748 4690 6.773150\n", - "0.708984 0.666992 4258 6.149269\n", - "0.958984 0.500000 4196 6.059731\n", - "0.386719 0.333496 3864 5.580267\n", - "0.591797 0.166748 2606 3.763503\n", - "1.000000 0.250000 2100 3.032754\n", - "NaN NaN 1566 2.261568\n", - "0.259277 0.333496 1330 1.920744\n", - "0.958984 1.000000 778 1.123563\n", - "1.000000 1.000000 378 0.545896\n", - "0.500000 0.250000 12 0.017330\n", - "0.479492 0.250000 6 0.008665\n", - " 0.500000 2 0.002888" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster h:\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
1.0000000.5000003632461.946178
0.5917970.33349648268.230158
0.7089840.33349636826.279205
0.9589840.50000028584.873973
0.3867190.33349625964.427163
1.0000001.00000020303.461919
0.25000018923.226577
NaNNaN17202.933251
0.3867190.16674812442.121491
0.7089840.6669927661.306320
0.5917970.6669925000.852689
0.9589841.0000002000.341076
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "1.000000 0.500000 36324 61.946178\n", - "0.591797 0.333496 4826 8.230158\n", - "0.708984 0.333496 3682 6.279205\n", - "0.958984 0.500000 2858 4.873973\n", - "0.386719 0.333496 2596 4.427163\n", - "1.000000 1.000000 2030 3.461919\n", - " 0.250000 1892 3.226577\n", - "NaN NaN 1720 2.933251\n", - "0.386719 0.166748 1244 2.121491\n", - "0.708984 0.666992 766 1.306320\n", - "0.591797 0.666992 500 0.852689\n", - "0.958984 1.000000 200 0.341076" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " For all clusters:\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
machine_idmachine_id_perc
capacity.cpuscapacity.memory
1.0000000.50000012423423.329891
0.5917970.33349610301319.344801
0.2592770.1667487807814.662260
0.7089840.3334965580110.478864
0.3867190.333496362376.804943
0.9589840.500000311515.849843
0.7089840.666992295945.557454
0.3867190.166748270115.072393
1.0000001.000000122862.307187
0.5917970.16674899021.859496
NaNNaN87291.639218
1.0000000.25000075501.417814
0.9589841.00000035520.667030
0.2592770.33349630240.567877
0.5917970.66699210000.187790
0.2592770.0833746340.119059
0.9589840.2500006000.112674
0.5000000.062500540.010141
0.250000340.006385
0.4794920.250000120.002253
0.7089840.25000060.001127
0.5917970.25000040.000751
0.7089840.50000020.000376
0.4794920.50000020.000376
\n", - "
" - ], - "text/plain": [ - " machine_id machine_id_perc\n", - "capacity.cpus capacity.memory \n", - "1.000000 0.500000 124234 23.329891\n", - "0.591797 0.333496 103013 19.344801\n", - "0.259277 0.166748 78078 14.662260\n", - "0.708984 0.333496 55801 10.478864\n", - "0.386719 0.333496 36237 6.804943\n", - "0.958984 0.500000 31151 5.849843\n", - "0.708984 0.666992 29594 5.557454\n", - "0.386719 0.166748 27011 5.072393\n", - "1.000000 1.000000 12286 2.307187\n", - "0.591797 0.166748 9902 1.859496\n", - "NaN NaN 8729 1.639218\n", - "1.000000 0.250000 7550 1.417814\n", - "0.958984 1.000000 3552 0.667030\n", - "0.259277 0.333496 3024 0.567877\n", - "0.591797 0.666992 1000 0.187790\n", - "0.259277 0.083374 634 0.119059\n", - "0.958984 0.250000 600 0.112674\n", - "0.500000 0.062500 54 0.010141\n", - " 0.250000 34 0.006385\n", - "0.479492 0.250000 12 0.002253\n", - "0.708984 0.250000 6 0.001127\n", - "0.591797 0.250000 4 0.000751\n", - "0.708984 0.500000 2 0.000376\n", - "0.479492 0.500000 2 0.000376" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Generate machine configurations table per cluster + a global table\n", - "\n", - "df = df[df.columns.difference(['missing_data_reason'])]\n", - "\n", - "for l in \"abcdefgh\":\n", - " print(\"\\nFor cluster \" + l + \":\\n\")\n", - " do_group_by(df[df.cluster==l])\n", - "\n", - "print(\"\\n For all clusters:\")\n", - "do_group_by(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "supreme-hepatitis", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb b/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb deleted file mode 100644 index 40fa4661..00000000 --- a/machine_time_waste/.ipynb_checkpoints/machine_time_waste-checkpoint.ipynb +++ /dev/null @@ -1,895 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "proper-gnome", - "metadata": {}, - "source": [ - "# Temporal impact: machine time waste" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fantastic-harrison", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "import pandas\n", - "from IPython import display\n", - "import findspark\n", - "findspark.init()\n", - "import pyspark\n", - "import pyspark.sql\n", - "import sys\n", - "\n", - "from pyspark.sql.functions import col, lag, when, concat_ws, last, first\n", - "from pyspark.sql import Window\n", - "from pyspark.sql.types import LongType" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "failing-rebecca", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "cluster=\"b\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "transsexual-baptist", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR:root:Exception while sending command.\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1207, in send_command\n", - " raise Py4JNetworkError(\"Answer from Java side is empty\")\n", - "py4j.protocol.Py4JNetworkError: Answer from Java side is empty\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1033, in send_command\n", - " response = connection.send_command(command)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1212, in send_command\n", - " \"Error while receiving\", e, proto.ERROR_ON_RECEIVE)\n", - "py4j.protocol.Py4JNetworkError: Error while receiving\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n", - "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36135)\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3343, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"\", line 5, in \n", - " df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")\n", - " File \"/opt/spark/python/pyspark/sql/readwriter.py\", line 300, in json\n", - " return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/opt/spark/python/pyspark/sql/utils.py\", line 128, in deco\n", - " return f(*a, **kw)\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 336, in get_return_value\n", - " format(target_id, \".\", name))\n", - "py4j.protocol.Py4JError: An error occurred while calling o26.json\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/claudio/python-venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2044, in showtraceback\n", - " stb = value._render_traceback_()\n", - "AttributeError: 'Py4JError' object has no attribute '_render_traceback_'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 977, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1115, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [Errno 111] Connection refused\n" - ] - }, - { - "ename": "Py4JError", - "evalue": "An error occurred while calling o26.json", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mPy4JError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mgetOrCreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/claudio/google_2019/instance_events/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcluster\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"_instance_events*.json.gz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/opt/spark/python/pyspark/sql/readwriter.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding, locale, pathGlobFilter, recursiveFileLookup)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 300\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jreader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_spark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPythonUtils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoSeq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 301\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRDD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1303\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1304\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1305\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1306\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1307\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mconverted\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 334\u001b[0m raise Py4JError(\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m format(target_id, \".\", name))\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mPy4JError\u001b[0m: An error occurred while calling o26.json" - ] - } - ], - "source": [ - "spark = pyspark.sql.SparkSession.builder \\\n", - " .appName(\"machine_time_waste\") \\\n", - " .getOrCreate()\n", - "\n", - "df = spark.read.json(\"/home/claudio/google_2019/instance_events/\" + cluster + \"/\" + cluster + \"_instance_events*.json.gz\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "juvenile-absolute", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "df.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "lucky-western", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "df.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "normal-settlement", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# .filter(df.collection_type == 0) \\\n", - "df2 = df \\\n", - " .withColumn(\"time\", col(\"time\").cast(LongType())) \\\n", - " .withColumn(\"type\", col(\"type\").cast(LongType())) \\\n", - " .withColumn(\"type\", when(col(\"type\").isNull(), 0).otherwise(col(\"type\"))) \\\n", - " .withColumn(\"id\", concat_ws(\"-\", \"collection_id\", \"instance_index\")) \\\n", - " .where(col(\"time\").isNotNull()) \\\n", - " .where(col(\"type\").isNotNull()) \\\n", - " .where((col(\"instance_index\").isNotNull()) & (col(\"collection_id\").isNotNull())) \\\n", - " .select(\"machine_id\", \"id\", \"time\", \"type\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "typical-homeless", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "df2.show()\n", - "print(\"Total: \" + str(df.count()))\n", - "print(\"Filtered: \" + str(df2.count()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "collect-saying", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# my_window = Window.partitionBy(\"machine_id\", \"id\").orderBy(df2.time.asc())\n", - "\n", - "w2 = Window.partitionBy(\"id\").orderBy(df2.time.asc()).rowsBetween(Window.currentRow, Window.unboundedFollowing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cooperative-appraisal", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# .withColumn(\"prev_time\", lag(df2.time).over(my_window)) \\\n", - "# .withColumn(\"prev_type\", lag(df2.type).over(my_window)) \\\n", - "\n", - "df3 = df2 \\\n", - " .withColumn(\"t3_time\", when((df2.type != 3), None).otherwise(df2.time)) \\\n", - " .withColumn(\"t45678_time\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.time)) \\\n", - " .withColumn(\"t45678_type\", when((df2.type < 4) | (df2.type > 8), None).otherwise(df2.type)) \\\n", - " .withColumn(\"t01_time\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.time)) \\\n", - " .withColumn(\"t01_type\", when((df2.type != 0) & (df2.type != 1), None).otherwise(df2.type)) \\\n", - " .withColumn(\"next_time\", when(df2.type == 3, first(col(\"t45678_time\"), ignorenulls=True).over(w2)) \\\n", - " .when((df2.type == 0) | (df2.type == 1), first(col(\"t3_time\"), ignorenulls=True).over(w2)) \\\n", - " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_time\"), ignorenulls=True).over(w2)) \\\n", - " .otherwise(None)) \\\n", - " .withColumn(\"next_type\", when(df2.type == 3, first(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n", - " .when((df2.type == 0) | (df2.type == 1), 3) \\\n", - " .when((df2.type >= 4) | (df2.type <= 8), first(col(\"t01_type\"), ignorenulls=True).over(w2)) \\\n", - " .otherwise(None)) \\\n", - " .withColumn(\"last_term_type\", last(col(\"t45678_type\"), ignorenulls=True).over(w2)) \\\n", - " .withColumn(\"time_delta\", col(\"next_time\") - col(\"time\")) \\\n", - " .select(\"machine_id\", \"id\", \"time\", \"type\", \"last_term_type\", \"time_delta\", \"t01_time\", \"t01_type\", \"t3_time\", \"t45678_time\", \"t45678_type\", \"next_time\", \"next_type\") \\" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ideal-angle", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "df4 = df3.where(df3.next_type.isNotNull()).groupby(\"type\", \"next_type\", \"last_term_type\").sum(\"time_delta\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "working-difficulty", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# df3.orderBy(df3.machine_id, df3.time).show(n=100)\n", - "# df3.printSchema()\n", - "df4.show(n=1000000)\n", - "df4.write.csv(\"/home/claudio/google_2019/thesis_queries/machine_time_waste/\" + cluster + \"_state_change.csv\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}