hw1: ex1, ex2, ex3 (code only), ex4.1, ex4.2 (no continents) done

This commit is contained in:
Claudio Maggioni 2023-03-22 10:35:29 +01:00
parent 7485e14887
commit a7d3b2fce0

View file

@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 1,
"id": "fcf3beb9",
"metadata": {},
"outputs": [],
@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "a0af6847",
"metadata": {},
"outputs": [
@ -62,7 +62,7 @@
"('Ü', 'sloppy-windows-1252')"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "22ce9426",
"metadata": {},
"outputs": [
@ -271,7 +271,7 @@
"4 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21 "
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -284,7 +284,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "a332b6a5",
"metadata": {},
"outputs": [
@ -313,7 +313,7 @@
" 'lastSeen': ['str']}"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -331,7 +331,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "11bfa9a2",
"metadata": {},
"outputs": [
@ -372,7 +372,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "f1c539c4",
"metadata": {},
"outputs": [
@ -413,7 +413,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "86074e70",
"metadata": {},
"outputs": [
@ -610,7 +610,7 @@
"4 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21 "
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -653,7 +653,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "8b6f9ce3",
"metadata": {},
"outputs": [
@ -683,7 +683,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "98f8d101",
"metadata": {},
"outputs": [
@ -708,7 +708,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "f300f49d",
"metadata": {},
"outputs": [
@ -727,7 +727,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "923c5354",
"metadata": {},
"outputs": [],
@ -738,7 +738,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"id": "4b847b1f",
"metadata": {},
"outputs": [],
@ -749,7 +749,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"id": "bf1f417d",
"metadata": {},
"outputs": [
@ -779,7 +779,7 @@
"dtype: int64"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -791,7 +791,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"id": "919e692f",
"metadata": {},
"outputs": [],
@ -836,7 +836,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"id": "7cc5c90f",
"metadata": {},
"outputs": [
@ -898,7 +898,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 16,
"id": "ca97e7c8",
"metadata": {},
"outputs": [
@ -964,7 +964,7 @@
},
{
"cell_type": "code",
"execution_count": 130,
"execution_count": 17,
"id": "eb956ed4",
"metadata": {},
"outputs": [],
@ -985,7 +985,7 @@
},
{
"cell_type": "code",
"execution_count": 131,
"execution_count": 18,
"id": "4a29684b",
"metadata": {},
"outputs": [],
@ -995,7 +995,7 @@
},
{
"cell_type": "code",
"execution_count": 136,
"execution_count": 19,
"id": "d3d58d25",
"metadata": {},
"outputs": [
@ -1044,7 +1044,10 @@
"\n",
"You'll need to work with the *'airports'* and *airports-delays* datasets. Examine the datasets and perform cleansing if needed, before performing the exercise.\n",
"\n",
"1. Create a dataframe that provides, for each country, the mean of flights delayed. Display these information by binning the flights delayed in 6 bins. The resulting dataframe should have the countries as rows and the 6 bins as columns. For this exercise you cannot use pivot_table but only groupby. \n",
"1. Create a dataframe that provides, for each country, <del>the mean of flights delayed</del>. Display this information by binning the flights delayed in 6 bins. The resulting dataframe should have the countries as rows and the 6 bins as columns. For this exercise you cannot use pivot_table but only groupby. \n",
"\n",
"<span style=\"color: red\">According to answer of question to professor:</span>\n",
"> Bin by delay_duration value, compute delay mean per-bin per-country \n",
"\n",
"2. Create a dataframe from *airports-delays* which shows for each continent and country:\n",
" 1. max, min and mean of **delay_duration**;\n",
@ -1057,9 +1060,180 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 53,
"id": "b4fde7e4",
"metadata": {},
"outputs": [],
"source": [
"# Load both datasets indexed by their ID column, treating the literal \\N marker as a missing value.\n",
"df_air = pd.read_csv(\n",
"    \"./datasets/airports.csv\",\n",
"    index_col='ID',\n",
"    na_values=['\\\\N'],\n",
")\n",
"# The delays file is semicolon-separated.\n",
"df_del = pd.read_csv(\n",
"    \"./datasets/airports-delays.csv\",\n",
"    index_col='ID',\n",
"    sep=\";\",\n",
"    na_values=['\\\\N'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "f8906707",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>delay_duration_bin</th>\n",
" <th>(15.999, 30.0]</th>\n",
" <th>(30.0, 35.0]</th>\n",
" <th>(35.0, 41.0]</th>\n",
" <th>(41.0, 47.0]</th>\n",
" <th>(47.0, 59.0]</th>\n",
" <th>(59.0, 850.0]</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>44.0</td>\n",
" <td>0.000000</td>\n",
" <td>60.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>18.5</td>\n",
" <td>31.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>56.000000</td>\n",
" <td>63.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>26.5</td>\n",
" <td>33.857143</td>\n",
" <td>38.75</td>\n",
" <td>43.0</td>\n",
" <td>51.200000</td>\n",
" <td>73.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>American Samoa</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>43.0</td>\n",
" <td>48.000000</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola</th>\n",
" <td>28.0</td>\n",
" <td>34.500000</td>\n",
" <td>36.00</td>\n",
" <td>45.0</td>\n",
" <td>51.666667</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"delay_duration_bin (15.999, 30.0] (30.0, 35.0] (35.0, 41.0] (41.0, 47.0] \\\n",
"country \n",
"Afghanistan 0.0 0.000000 0.00 44.0 \n",
"Albania 18.5 31.000000 0.00 0.0 \n",
"Algeria 26.5 33.857143 38.75 43.0 \n",
"American Samoa 0.0 0.000000 0.00 43.0 \n",
"Angola 28.0 34.500000 36.00 45.0 \n",
"\n",
"delay_duration_bin (47.0, 59.0] (59.0, 850.0] \n",
"country \n",
"Afghanistan 0.000000 60.0 \n",
"Albania 56.000000 63.0 \n",
"Algeria 51.200000 73.0 \n",
"American Samoa 48.000000 0.0 \n",
"Angola 51.666667 0.0 "
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Exercise 4.1: mean delay_duration per country, split into 6 delay_duration bins.\n",
"# Only the two columns needed for the aggregation are copied, so df_del stays untouched.\n",
"df_4_1 = df_del.loc[:, ['country', 'delay_duration']].copy()\n",
"\n",
"# The following statement bins the rows by the value of delay_duration.\n",
"# The bin edges are chosen as equally-spaced quantiles of the data. This is done to\n",
"# better distribute the rows between bins, as the data is quite skewed towards low values.\n",
"df_4_1[\"delay_duration_bin\"] = pd.qcut(df_4_1.delay_duration, 6)\n",
"\n",
"# The dataframe will contain countries as row indices, the 6 bins as columns and values\n",
"# corresponding to the mean delay_duration per country, per bin.\n",
"# - observed=False is spelled out because the default for categorical groupers is\n",
"#   deprecated and differs between pandas versions.\n",
"# - unstack (instead of pivot) reshapes the groupby result directly, in line with the\n",
"#   groupby-only requirement of the exercise.\n",
"# - fillna(0) runs after the reshape, so every (country, bin) pair without any\n",
"#   delay_duration gets a value of 0 regardless of how the groupby enumerated the groups.\n",
"df_4_1 = df_4_1 \\\n",
"    .groupby(['country', 'delay_duration_bin'], observed=False)['delay_duration'] \\\n",
"    .mean() \\\n",
"    .unstack('delay_duration_bin') \\\n",
"    .fillna(0)\n",
"\n",
"df_4_1.head()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "a677ce07",
"metadata": {},
"outputs": [],
"source": [
"# 4.2\n",
"# TODO: continents\n",
"# Per-country summary of the delays dataset: min/mean/max of delay_duration, plus sum and mean\n",
"# of the cancelled, delayed and planned flight counts. Each keyword names one output column.\n",
"df_4_2 = (\n",
"    df_del[['country', 'delay_duration', 'flights_cancelled', 'flights_delayed', 'flights_planned']]\n",
"    .groupby('country')\n",
"    .agg(\n",
"        dur_min=('delay_duration', 'min'),\n",
"        dur_mean=('delay_duration', 'mean'),\n",
"        dur_max=('delay_duration', 'max'),\n",
"        cancelled_sum=('flights_cancelled', 'sum'),\n",
"        cancelled_mean=('flights_cancelled', 'mean'),\n",
"        delayed_sum=('flights_delayed', 'sum'),\n",
"        delayed_mean=('flights_delayed', 'mean'),\n",
"        planned_sum=('flights_planned', 'sum'),\n",
"        planned_mean=('flights_planned', 'mean'),\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "a29b8c2f",
"metadata": {},
"outputs": [
{
"data": {
@ -1132,7 +1306,7 @@
" <td>31.328199</td>\n",
" <td>35.388599</td>\n",
" <td>-1266</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>E</td>\n",
" <td>Asia/Jerusalem</td>\n",
" <td>airport</td>\n",
@ -1152,7 +1326,7 @@
" <td>30.621700</td>\n",
" <td>35.203300</td>\n",
" <td>-164</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>E</td>\n",
" <td>Asia/Jerusalem</td>\n",
" <td>airport</td>\n",
@ -1172,7 +1346,7 @@
" <td>33.626701</td>\n",
" <td>-116.160004</td>\n",
" <td>-115</td>\n",
" <td>-8</td>\n",
" <td>-8.0</td>\n",
" <td>A</td>\n",
" <td>America/Los_Angeles</td>\n",
" <td>airport</td>\n",
@ -1192,7 +1366,7 @@
" <td>47.121899</td>\n",
" <td>51.821400</td>\n",
" <td>-72</td>\n",
" <td>5</td>\n",
" <td>5.0</td>\n",
" <td>U</td>\n",
" <td>Asia/Oral</td>\n",
" <td>airport</td>\n",
@ -1272,7 +1446,7 @@
" <td>47.092444</td>\n",
" <td>8.305184</td>\n",
" <td>1400</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>E</td>\n",
" <td>Europe/Zurich</td>\n",
" <td>airport</td>\n",
@ -1292,7 +1466,7 @@
" <td>3.421000</td>\n",
" <td>115.153999</td>\n",
" <td>1400</td>\n",
" <td>8</td>\n",
" <td>8.0</td>\n",
" <td>N</td>\n",
" <td>Asia/Kuala_Lumpur</td>\n",
" <td>airport</td>\n",
@ -1312,7 +1486,7 @@
" <td>-13.549100</td>\n",
" <td>-48.195301</td>\n",
" <td>1401</td>\n",
" <td>-3</td>\n",
" <td>-3.0</td>\n",
" <td>S</td>\n",
" <td>America/Sao_Paulo</td>\n",
" <td>airport</td>\n",
@ -1327,12 +1501,12 @@
" <td>Bubovice Airport</td>\n",
" <td>Bubovice</td>\n",
" <td>Czech Republic</td>\n",
" <td>\\N</td>\n",
" <td>NaN</td>\n",
" <td>LKBU</td>\n",
" <td>49.974400</td>\n",
" <td>14.178100</td>\n",
" <td>1401</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>E</td>\n",
" <td>Europe/Prague</td>\n",
" <td>airport</td>\n",
@ -1360,67 +1534,60 @@
"1670 Emmen Air Base Emmen Switzerland EML \n",
"6215 Long Lellang Airport Long Datih Malaysia LGL \n",
"7375 Minaçu Airport Minacu Brazil MQH \n",
"9253 Bubovice Airport Bubovice Czech Republic \\N \n",
"9253 Bubovice Airport Bubovice Czech Republic NaN \n",
"\n",
" ICAO latitude longitude altitude timezone DST tz_database_timezone \\\n",
" ICAO latitude longitude altitude timezone DST \\\n",
"ID \n",
"1600 LLMZ 31.328199 35.388599 -1266 2 E Asia/Jerusalem \n",
"1595 LLEY 30.621700 35.203300 -164 2 E Asia/Jerusalem \n",
"7646 KTRM 33.626701 -116.160004 -115 -8 A America/Los_Angeles \n",
"4357 UATG 47.121899 51.821400 -72 5 U Asia/Oral \n",
"2151 OINR 36.909901 50.679600 -70 3.5 E Asia/Tehran \n",
"... ... ... ... ... ... .. ... \n",
"3039 VELP 23.840599 92.619698 1398 5.5 N Asia/Calcutta \n",
"1670 LSME 47.092444 8.305184 1400 1 E Europe/Zurich \n",
"6215 WBGF 3.421000 115.153999 1400 8 N Asia/Kuala_Lumpur \n",
"7375 SBMC -13.549100 -48.195301 1401 -3 S America/Sao_Paulo \n",
"9253 LKBU 49.974400 14.178100 1401 1 E Europe/Prague \n",
"1600 LLMZ 31.328199 35.388599 -1266 2.0 E \n",
"1595 LLEY 30.621700 35.203300 -164 2.0 E \n",
"7646 KTRM 33.626701 -116.160004 -115 -8.0 A \n",
"4357 UATG 47.121899 51.821400 -72 5.0 U \n",
"2151 OINR 36.909901 50.679600 -70 3.5 E \n",
"... ... ... ... ... ... .. \n",
"3039 VELP 23.840599 92.619698 1398 5.5 N \n",
"1670 LSME 47.092444 8.305184 1400 1.0 E \n",
"6215 WBGF 3.421000 115.153999 1400 8.0 N \n",
"7375 SBMC -13.549100 -48.195301 1401 -3.0 S \n",
"9253 LKBU 49.974400 14.178100 1401 1.0 E \n",
"\n",
" type source flights_planned flights_cancelled \\\n",
" tz_database_timezone type source flights_planned \\\n",
"ID \n",
"1600 airport OurAirports 62 0 \n",
"1595 airport OurAirports 56 0 \n",
"7646 airport OurAirports 60 0 \n",
"4357 airport OurAirports 71 0 \n",
"2151 airport OurAirports 62 1 \n",
"1600 Asia/Jerusalem airport OurAirports 62 \n",
"1595 Asia/Jerusalem airport OurAirports 56 \n",
"7646 America/Los_Angeles airport OurAirports 60 \n",
"4357 Asia/Oral airport OurAirports 71 \n",
"2151 Asia/Tehran airport OurAirports 62 \n",
"... ... ... ... ... \n",
"3039 airport OurAirports 118 0 \n",
"1670 airport OurAirports 124 0 \n",
"6215 airport OurAirports 126 0 \n",
"7375 airport OurAirports 119 1 \n",
"9253 airport OurAirports 128 0 \n",
"3039 Asia/Calcutta airport OurAirports 118 \n",
"1670 Europe/Zurich airport OurAirports 124 \n",
"6215 Asia/Kuala_Lumpur airport OurAirports 126 \n",
"7375 America/Sao_Paulo airport OurAirports 119 \n",
"9253 Europe/Prague airport OurAirports 128 \n",
"\n",
" flights_delayed delay_duration \n",
" flights_cancelled flights_delayed delay_duration \n",
"ID \n",
"1600 9 32.0 \n",
"1595 7 24.0 \n",
"7646 7 28.0 \n",
"4357 9 35.0 \n",
"2151 6 47.0 \n",
"... ... ... \n",
"3039 23 38.0 \n",
"1670 19 38.0 \n",
"6215 18 32.0 \n",
"7375 25 48.0 \n",
"9253 15 32.0 \n",
"1600 0 9 32.0 \n",
"1595 0 7 24.0 \n",
"7646 0 7 28.0 \n",
"4357 0 9 35.0 \n",
"2151 1 6 47.0 \n",
"... ... ... ... \n",
"3039 0 23 38.0 \n",
"1670 0 19 38.0 \n",
"6215 0 18 32.0 \n",
"7375 1 25 48.0 \n",
"9253 0 15 32.0 \n",
"\n",
"[6029 rows x 17 columns]"
]
},
"execution_count": 62,
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_air = pd.read_csv(\"./datasets/airports.csv\", index_col='ID')\n",
"df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\")\n",
"\n",
"df_del\n",
"#pd.cut(df_del.flights_delayed, range(0, df_del.flights_delayed.max(), 25))\n",
"\n",
"#df_bycountry = df_del.loc[:, ['country', 'flights_delayed']].groupby('country').sum().sort_values('flights_delayed', ascending=False)\n",
"#df_bycountry\n"
"df_del"
]
},
{