From a7d3b2fce0dfa1277c17f678a186b37dd5167b76 Mon Sep 17 00:00:00 2001 From: Claudio Maggioni Date: Wed, 22 Mar 2023 10:35:29 +0100 Subject: [PATCH] hw1: ex1, ex2, ex3 (code only), ex4.1, ex4.2 (no continents) done --- Assignment1/Assignment1.ipynb | 335 +++++++++++++++++++++++++--------- 1 file changed, 251 insertions(+), 84 deletions(-) diff --git a/Assignment1/Assignment1.ipynb b/Assignment1/Assignment1.ipynb index 003fbb9..4571721 100644 --- a/Assignment1/Assignment1.ipynb +++ b/Assignment1/Assignment1.ipynb @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 1, "id": "fcf3beb9", "metadata": {}, "outputs": [], @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "a0af6847", "metadata": {}, "outputs": [ @@ -62,7 +62,7 @@ "('Ü', 'sloppy-windows-1252')" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "22ce9426", "metadata": {}, "outputs": [ @@ -271,7 +271,7 @@ "4 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21 " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -284,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "a332b6a5", "metadata": {}, "outputs": [ @@ -313,7 +313,7 @@ " 'lastSeen': ['str']}" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -331,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "11bfa9a2", "metadata": {}, "outputs": [ @@ -372,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "f1c539c4", "metadata": {}, "outputs": [ @@ -413,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "86074e70", "metadata": {}, "outputs": [ @@ -610,7 +610,7 @@ "4 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21 " ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -653,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "8b6f9ce3", "metadata": {}, "outputs": [ @@ -683,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "98f8d101", "metadata": {}, "outputs": [ @@ -708,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "f300f49d", "metadata": {}, "outputs": [ @@ -727,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "923c5354", "metadata": {}, "outputs": [], @@ -738,7 +738,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "4b847b1f", "metadata": {}, "outputs": [], @@ -749,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "bf1f417d", "metadata": {}, "outputs": [ @@ -779,7 +779,7 @@ "dtype: int64" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "919e692f", "metadata": {}, "outputs": [], @@ -836,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "7cc5c90f", "metadata": {}, "outputs": [ @@ -898,7 +898,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 16, "id": "ca97e7c8", "metadata": {}, "outputs": [ @@ -964,7 +964,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 17, "id": "eb956ed4", "metadata": {}, "outputs": [], @@ -985,7 +985,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 18, "id": "4a29684b", "metadata": {}, "outputs": [], @@ -995,7 +995,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 19, "id": "d3d58d25", "metadata": {}, "outputs": [ @@ -1044,7 +1044,10 @@ "\n", "You'll need to work with the *'airports'* and *‘airports-delays’* datasets. Examine the datasets and perform cleansing if needed, before performing the exercise.\n", "\n", - "1. Create a dataframe that provides, for each country, the mean of flights delayed. Display these information by binning the flights delayed in 6 bins. The resulting dataframe should have the countries as rows and the 6 bins as columns. For this exercise you cannot use pivot_table but only groupby. \n", + "1. Create a dataframe that provides, for each country, the mean of flights delayed. Display these information by binning the flights delayed in 6 bins. The resulting dataframe should have the countries as rows and the 6 bins as columns. For this exercise you cannot use pivot_table but only groupby. \n", + "\n", + "According to answer of question to professor:\n", + "> Bin by delay_duration value, compute delay mean per-bin per-country \n", "\n", "2. Create a dataframe from ‘a*irports-delays’* which shows for each continent and country:\n", " 1. max, min and mean of ‘**delay_duration**’;\n", @@ -1057,9 +1060,180 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 53, "id": "b4fde7e4", "metadata": {}, + "outputs": [], + "source": [ + "df_air = pd.read_csv(\"./datasets/airports.csv\", index_col='ID', na_values=['\\\\N'])\n", + "df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\", na_values=['\\\\N'])" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "f8906707", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
delay_duration_bin(15.999, 30.0](30.0, 35.0](35.0, 41.0](41.0, 47.0](47.0, 59.0](59.0, 850.0]
country
Afghanistan0.00.0000000.0044.00.00000060.0
Albania18.531.0000000.000.056.00000063.0
Algeria26.533.85714338.7543.051.20000073.0
American Samoa0.00.0000000.0043.048.0000000.0
Angola28.034.50000036.0045.051.6666670.0
\n", + "
" + ], + "text/plain": [ + "delay_duration_bin (15.999, 30.0] (30.0, 35.0] (35.0, 41.0] (41.0, 47.0] \\\n", + "country \n", + "Afghanistan 0.0 0.000000 0.00 44.0 \n", + "Albania 18.5 31.000000 0.00 0.0 \n", + "Algeria 26.5 33.857143 38.75 43.0 \n", + "American Samoa 0.0 0.000000 0.00 43.0 \n", + "Angola 28.0 34.500000 36.00 45.0 \n", + "\n", + "delay_duration_bin (47.0, 59.0] (59.0, 850.0] \n", + "country \n", + "Afghanistan 0.000000 60.0 \n", + "Albania 56.000000 63.0 \n", + "Algeria 51.200000 73.0 \n", + "American Samoa 48.000000 0.0 \n", + "Angola 51.666667 0.0 " + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_4_1 = df_del.copy()\n", + "\n", + "# The following statements bins the data by the value of delay_duration.\n", + "# The bins are chosen as equally-spaced percentile values of the data. This is done to \n", + "# better distribute the data between bins, as it is quite skewed towards low values\n", + "df_4_1[\"delay_duration_bin\"] = pd.qcut(df_del.delay_duration, 6)\n", + "\n", + "# The dataframe will contain countries as row indices, the 6 bins as columns and values\n", + "# corresponding to the mean delay_duration per country, per bin. When no delay_duration \n", + "# falls in a particular bin for some country, that bin has a value of 0\n", + "df_4_1 = df_4_1.loc[:, ['country', 'delay_duration', 'delay_duration_bin']] \\\n", + " .groupby(['country', 'delay_duration_bin']) \\\n", + " .mean() \\\n", + " .fillna(0) \\\n", + " .reset_index() \\\n", + " .pivot(index='country', columns='delay_duration_bin', values='delay_duration') \n", + "\n", + "df_4_1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "a677ce07", + "metadata": {}, + "outputs": [], + "source": [ + "# 4.2\n", + "# TODO: continents\n", + "df_4_2 = df_del.loc[:, ['country', 'delay_duration', 'flights_cancelled', 'flights_delayed', 'flights_planned']] \\\n", + " .groupby('country') \\\n", + " .agg(dur_min=('delay_duration', 'min'), \\\n", + " dur_mean=('delay_duration', 'mean'), \\\n", + " dur_max=('delay_duration', 'max'), \\\n", + " cancelled_sum=('flights_cancelled', 'sum'), \\\n", + " cancelled_mean=('flights_cancelled', 'mean'), \\\n", + " delayed_sum=('flights_delayed', 'sum'), \\\n", + " delayed_mean=('flights_delayed', 'mean'), \\\n", + " planned_sum=('flights_planned', 'sum'), \\\n", + " planned_mean=('flights_planned', 'mean'))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "a29b8c2f", + "metadata": {}, "outputs": [ { "data": { @@ -1132,7 +1306,7 @@ " 31.328199\n", " 35.388599\n", " -1266\n", - " 2\n", + " 2.0\n", " E\n", " Asia/Jerusalem\n", " airport\n", @@ -1152,7 +1326,7 @@ " 30.621700\n", " 35.203300\n", " -164\n", - " 2\n", + " 2.0\n", " E\n", " Asia/Jerusalem\n", " airport\n", @@ -1172,7 +1346,7 @@ " 33.626701\n", " -116.160004\n", " -115\n", - " -8\n", + " -8.0\n", " A\n", " America/Los_Angeles\n", " airport\n", @@ -1192,7 +1366,7 @@ " 47.121899\n", " 51.821400\n", " -72\n", - " 5\n", + " 5.0\n", " U\n", " Asia/Oral\n", " airport\n", @@ -1272,7 +1446,7 @@ " 47.092444\n", " 8.305184\n", " 1400\n", - " 1\n", + " 1.0\n", " E\n", " Europe/Zurich\n", " airport\n", @@ -1292,7 +1466,7 @@ " 3.421000\n", " 115.153999\n", " 1400\n", - " 8\n", + " 8.0\n", " N\n", " Asia/Kuala_Lumpur\n", " airport\n", @@ -1312,7 +1486,7 @@ " -13.549100\n", " -48.195301\n", " 1401\n", - " -3\n", + " -3.0\n", " S\n", " America/Sao_Paulo\n", " airport\n", @@ -1327,12 +1501,12 @@ " Bubovice Airport\n", " Bubovice\n", " Czech Republic\n", - " \\N\n", + " NaN\n", " LKBU\n", " 49.974400\n", " 14.178100\n", " 1401\n", - " 1\n", + " 1.0\n", " E\n", " Europe/Prague\n", " airport\n", @@ -1360,67 +1534,60 @@ "1670 Emmen Air Base Emmen Switzerland EML \n", "6215 Long Lellang Airport Long Datih Malaysia LGL \n", "7375 Minaçu Airport Minacu Brazil MQH \n", - "9253 Bubovice Airport Bubovice Czech Republic \\N \n", + "9253 Bubovice Airport Bubovice Czech Republic NaN \n", "\n", - " ICAO latitude longitude altitude timezone DST tz_database_timezone \\\n", - "ID \n", - "1600 LLMZ 31.328199 35.388599 -1266 2 E Asia/Jerusalem \n", - "1595 LLEY 30.621700 35.203300 -164 2 E Asia/Jerusalem \n", - "7646 KTRM 33.626701 -116.160004 -115 -8 A America/Los_Angeles \n", - "4357 UATG 47.121899 51.821400 -72 5 U Asia/Oral \n", - "2151 OINR 36.909901 50.679600 -70 3.5 E Asia/Tehran \n", - "... ... ... ... ... ... .. ... \n", - "3039 VELP 23.840599 92.619698 1398 5.5 N Asia/Calcutta \n", - "1670 LSME 47.092444 8.305184 1400 1 E Europe/Zurich \n", - "6215 WBGF 3.421000 115.153999 1400 8 N Asia/Kuala_Lumpur \n", - "7375 SBMC -13.549100 -48.195301 1401 -3 S America/Sao_Paulo \n", - "9253 LKBU 49.974400 14.178100 1401 1 E Europe/Prague \n", + " ICAO latitude longitude altitude timezone DST \\\n", + "ID \n", + "1600 LLMZ 31.328199 35.388599 -1266 2.0 E \n", + "1595 LLEY 30.621700 35.203300 -164 2.0 E \n", + "7646 KTRM 33.626701 -116.160004 -115 -8.0 A \n", + "4357 UATG 47.121899 51.821400 -72 5.0 U \n", + "2151 OINR 36.909901 50.679600 -70 3.5 E \n", + "... ... ... ... ... ... .. \n", + "3039 VELP 23.840599 92.619698 1398 5.5 N \n", + "1670 LSME 47.092444 8.305184 1400 1.0 E \n", + "6215 WBGF 3.421000 115.153999 1400 8.0 N \n", + "7375 SBMC -13.549100 -48.195301 1401 -3.0 S \n", + "9253 LKBU 49.974400 14.178100 1401 1.0 E \n", "\n", - " type source flights_planned flights_cancelled \\\n", - "ID \n", - "1600 airport OurAirports 62 0 \n", - "1595 airport OurAirports 56 0 \n", - "7646 airport OurAirports 60 0 \n", - "4357 airport OurAirports 71 0 \n", - "2151 airport OurAirports 62 1 \n", - "... ... ... ... ... \n", - "3039 airport OurAirports 118 0 \n", - "1670 airport OurAirports 124 0 \n", - "6215 airport OurAirports 126 0 \n", - "7375 airport OurAirports 119 1 \n", - "9253 airport OurAirports 128 0 \n", + " tz_database_timezone type source flights_planned \\\n", + "ID \n", + "1600 Asia/Jerusalem airport OurAirports 62 \n", + "1595 Asia/Jerusalem airport OurAirports 56 \n", + "7646 America/Los_Angeles airport OurAirports 60 \n", + "4357 Asia/Oral airport OurAirports 71 \n", + "2151 Asia/Tehran airport OurAirports 62 \n", + "... ... ... ... ... \n", + "3039 Asia/Calcutta airport OurAirports 118 \n", + "1670 Europe/Zurich airport OurAirports 124 \n", + "6215 Asia/Kuala_Lumpur airport OurAirports 126 \n", + "7375 America/Sao_Paulo airport OurAirports 119 \n", + "9253 Europe/Prague airport OurAirports 128 \n", "\n", - " flights_delayed delay_duration \n", - "ID \n", - "1600 9 32.0 \n", - "1595 7 24.0 \n", - "7646 7 28.0 \n", - "4357 9 35.0 \n", - "2151 6 47.0 \n", - "... ... ... \n", - "3039 23 38.0 \n", - "1670 19 38.0 \n", - "6215 18 32.0 \n", - "7375 25 48.0 \n", - "9253 15 32.0 \n", + " flights_cancelled flights_delayed delay_duration \n", + "ID \n", + "1600 0 9 32.0 \n", + "1595 0 7 24.0 \n", + "7646 0 7 28.0 \n", + "4357 0 9 35.0 \n", + "2151 1 6 47.0 \n", + "... ... ... ... \n", + "3039 0 23 38.0 \n", + "1670 0 19 38.0 \n", + "6215 0 18 32.0 \n", + "7375 1 25 48.0 \n", + "9253 0 15 32.0 \n", "\n", "[6029 rows x 17 columns]" ] }, - "execution_count": 62, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_air = pd.read_csv(\"./datasets/airports.csv\", index_col='ID')\n", - "df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\")\n", - "\n", - "df_del\n", - "#pd.cut(df_del.flights_delayed, range(0, df_del.flights_delayed.max(), 25))\n", - "\n", - "#df_bycountry = df_del.loc[:, ['country', 'flights_delayed']].groupby('country').sum().sort_values('flights_delayed', ascending=False)\n", - "#df_bycountry\n" + "df_del" ] }, {