diff --git a/Assignment1/Assignment1.ipynb b/Assignment1/Assignment1.ipynb index 4571721..f46aa6e 100644 --- a/Assignment1/Assignment1.ipynb +++ b/Assignment1/Assignment1.ipynb @@ -969,6 +969,8 @@ "metadata": {}, "outputs": [], "source": [ + "# TODO: text explanation\n", + "\n", "df_m = pd.read_csv(\"./datasets/market_value_decline.csv\").rename(columns={\n", " 'Unnamed: 0': 'bank',\n", " 'market_value_2007': '2007',\n", @@ -1060,18 +1062,65 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 41, "id": "b4fde7e4", "metadata": {}, "outputs": [], "source": [ "df_air = pd.read_csv(\"./datasets/airports.csv\", index_col='ID', na_values=['\\\\N'])\n", - "df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\", na_values=['\\\\N'])" + "df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\", na_values=['\\\\N']) \\\n", + " .dropna(subset=['tz_database_timezone'])" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 42, + "id": "25391739", + "metadata": {}, + "outputs": [], + "source": [ + "def tz_to_continent(tz: str) -> str:\n", + " tz_mappings = {\n", + " 'Asia/': 'Asia',\n", + " 'Africa/': 'Africa',\n", + " 'America/': 'America',\n", + " 'Europe/': 'Europe',\n", + " 'Australia/': 'Oceania',\n", + " 'Pacific/': 'Oceania',\n", + " 'Antarctica/': 'Antarctica',\n", + " 'Arctic/Longyearbyen': 'Europe',\n", + " 'Atlantic/Azores': 'Europe',\n", + " 'Atlantic/Bermuda': 'America',\n", + " 'Atlantic/Canary': 'Africa',\n", + " 'Atlantic/Cape_Verde': 'Africa',\n", + " 'Atlantic/Faeroe': 'Europe',\n", + " 'Atlantic/Reykjavik': 'Europe',\n", + " 'Atlantic/St_Helena': 'Africa',\n", + " 'Atlantic/Stanley': 'America',\n", + " 'Indian/Antananarivo': 'Africa',\n", + " 'Indian/Chagos': 'Asia',\n", + " 'Indian/Christmas': 'Oceania',\n", + " 'Indian/Cocos': 'Oceania',\n", + " 'Indian/Comoro': 'Africa',\n", + " 'Indian/Mahe': 'Africa',\n", + " 'Indian/Maldives': 'Asia',\n", + " 
'Indian/Mauritius': 'Africa',\n", + " 'Indian/Mayotte': 'Africa',\n", + " 'Indian/Reunion': 'Africa',\n", + " }\n", + " if type(tz) != str:\n", + " raise ValueError(\"tz not str\")\n", + " to_return = [v for (k, v) in tz_mappings.items() if tz.startswith(k)]\n", + " if len(to_return) == 0:\n", + " raise ValueError(f\"'{tz}' no continent found\")\n", + " return to_return[0]\n", + "\n", + "df_del[\"continent\"] = df_del[\"tz_database_timezone\"].apply(tz_to_continent)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, "id": "f8906707", "metadata": {}, "outputs": [ @@ -1125,12 +1174,12 @@ " \n", " \n", " Albania\n", - " 18.5\n", - " 31.000000\n", + " 0.0\n", + " 0.000000\n", " 0.00\n", " 0.0\n", " 56.000000\n", - " 63.0\n", + " 0.0\n", " \n", " \n", " Algeria\n", @@ -1153,7 +1202,7 @@ " \n", " Angola\n", " 28.0\n", - " 34.500000\n", + " 35.000000\n", " 36.00\n", " 45.0\n", " 51.666667\n", @@ -1167,21 +1216,21 @@ "delay_duration_bin (15.999, 30.0] (30.0, 35.0] (35.0, 41.0] (41.0, 47.0] \\\n", "country \n", "Afghanistan 0.0 0.000000 0.00 44.0 \n", - "Albania 18.5 31.000000 0.00 0.0 \n", + "Albania 0.0 0.000000 0.00 0.0 \n", "Algeria 26.5 33.857143 38.75 43.0 \n", "American Samoa 0.0 0.000000 0.00 43.0 \n", - "Angola 28.0 34.500000 36.00 45.0 \n", + "Angola 28.0 35.000000 36.00 45.0 \n", "\n", "delay_duration_bin (47.0, 59.0] (59.0, 850.0] \n", "country \n", "Afghanistan 0.000000 60.0 \n", - "Albania 56.000000 63.0 \n", + "Albania 56.000000 0.0 \n", "Algeria 51.200000 73.0 \n", "American Samoa 48.000000 0.0 \n", "Angola 51.666667 0.0 " ] }, - "execution_count": 81, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1209,31 +1258,9 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 40, "id": "a677ce07", "metadata": {}, - "outputs": [], - "source": [ - "# 4.2\n", - "# TODO: continents\n", - "df_4_2 = df_del.loc[:, ['country', 'delay_duration', 'flights_cancelled', 'flights_delayed', 'flights_planned']] \\\n", - 
" .groupby('country') \\\n", - " .agg(dur_min=('delay_duration', 'min'), \\\n", - " dur_mean=('delay_duration', 'mean'), \\\n", - " dur_max=('delay_duration', 'max'), \\\n", - " cancelled_sum=('flights_cancelled', 'sum'), \\\n", - " cancelled_mean=('flights_cancelled', 'mean'), \\\n", - " delayed_sum=('flights_delayed', 'sum'), \\\n", - " delayed_mean=('flights_delayed', 'mean'), \\\n", - " planned_sum=('flights_planned', 'sum'), \\\n", - " planned_mean=('flights_planned', 'mean'))" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "a29b8c2f", - "metadata": {}, "outputs": [ { "data": { @@ -1256,34 +1283,20 @@ " \n", " \n", " \n", - " airport_name\n", - " city\n", - " country\n", - " IATA\n", - " ICAO\n", - " latitude\n", - " longitude\n", - " altitude\n", - " timezone\n", - " DST\n", - " tz_database_timezone\n", - " type\n", - " source\n", - " flights_planned\n", - " flights_cancelled\n", - " flights_delayed\n", - " delay_duration\n", + " \n", + " dur_min\n", + " dur_mean\n", + " dur_max\n", + " cancelled_sum\n", + " cancelled_mean\n", + " delayed_sum\n", + " delayed_mean\n", + " planned_sum\n", + " planned_mean\n", " \n", " \n", - " ID\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " continent\n", + " country\n", " \n", " \n", " \n", @@ -1297,297 +1310,161 @@ " \n", " \n", " \n", - " 1600\n", - " Bar Yehuda Airfield\n", - " Metzada\n", - " Israel\n", - " MTZ\n", - " LLMZ\n", - " 31.328199\n", - " 35.388599\n", - " -1266\n", - " 2.0\n", - " E\n", - " Asia/Jerusalem\n", - " airport\n", - " OurAirports\n", - " 62\n", - " 0\n", - " 9\n", - " 32.0\n", - " \n", - " \n", - " 1595\n", - " Ein Yahav Airfield\n", - " Eyn-yahav\n", - " Israel\n", - " EIY\n", - " LLEY\n", - " 30.621700\n", - " 35.203300\n", - " -164\n", - " 2.0\n", - " E\n", - " Asia/Jerusalem\n", - " airport\n", - " OurAirports\n", - " 56\n", - " 0\n", - " 7\n", - " 24.0\n", - " \n", - " \n", - " 7646\n", - " Jacqueline Cochran Regional Airport\n", - " Palm 
Springs\n", - " United States\n", - " TRM\n", - " KTRM\n", - " 33.626701\n", - " -116.160004\n", - " -115\n", - " -8.0\n", - " A\n", - " America/Los_Angeles\n", - " airport\n", - " OurAirports\n", - " 60\n", - " 0\n", - " 7\n", - " 28.0\n", - " \n", - " \n", - " 4357\n", - " Atyrau Airport\n", - " Atyrau\n", - " Kazakhstan\n", - " GUW\n", - " UATG\n", - " 47.121899\n", - " 51.821400\n", - " -72\n", - " 5.0\n", - " U\n", - " Asia/Oral\n", - " airport\n", - " OurAirports\n", - " 71\n", - " 0\n", - " 9\n", - " 35.0\n", - " \n", - " \n", - " 2151\n", - " Ramsar Airport\n", - " Ramsar\n", - " Iran\n", - " RZR\n", - " OINR\n", - " 36.909901\n", - " 50.679600\n", - " -70\n", - " 3.5\n", - " E\n", - " Asia/Tehran\n", - " airport\n", - " OurAirports\n", - " 62\n", - " 1\n", + " Africa\n", + " Algeria\n", + " 26.0\n", + " 43.739130\n", + " 82.0\n", " 6\n", - " 47.0\n", + " 0.26087\n", + " 360\n", + " 15.652174\n", + " 1864\n", + " 81.043478\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " Angola\n", + " 28.0\n", + " 42.714286\n", + " 53.0\n", + " 9\n", + " 1.12500\n", + " 97\n", + " 12.125000\n", + " 472\n", + " 59.000000\n", " \n", " \n", - " 3039\n", - " Lengpui Airport\n", - " Aizwal\n", - " India\n", - " AJL\n", - " VELP\n", - " 23.840599\n", - " 92.619698\n", - " 1398\n", - " 5.5\n", - " N\n", - " Asia/Calcutta\n", - " airport\n", - " OurAirports\n", - " 118\n", + " Benin\n", + " 69.0\n", + " 69.000000\n", + " 69.0\n", " 0\n", - " 23\n", - " 38.0\n", + " 0.00000\n", + " 7\n", + " 7.000000\n", + " 28\n", + " 28.000000\n", " \n", " \n", - " 1670\n", - " Emmen Air Base\n", - " Emmen\n", - " Switzerland\n", - " EML\n", - " LSME\n", - " 47.092444\n", - " 8.305184\n", - " 1400\n", - " 1.0\n", - " E\n", - " Europe/Zurich\n", - " airport\n", - " OurAirports\n", - " 124\n", - " 0\n", - " 
19\n", - " 38.0\n", - " \n", - " \n", - " 6215\n", - " Long Lellang Airport\n", - " Long Datih\n", - " Malaysia\n", - " LGL\n", - " WBGF\n", - " 3.421000\n", - " 115.153999\n", - " 1400\n", - " 8.0\n", - " N\n", - " Asia/Kuala_Lumpur\n", - " airport\n", - " OurAirports\n", - " 126\n", + " Burkina Faso\n", + " 35.0\n", + " 35.000000\n", + " 35.0\n", " 0\n", + " 0.00000\n", " 18\n", - " 32.0\n", + " 18.000000\n", + " 65\n", + " 65.000000\n", " \n", " \n", - " 7375\n", - " Minaçu Airport\n", - " Minacu\n", - " Brazil\n", - " MQH\n", - " SBMC\n", - " -13.549100\n", - " -48.195301\n", - " 1401\n", - " -3.0\n", - " S\n", - " America/Sao_Paulo\n", - " airport\n", - " OurAirports\n", - " 119\n", - " 1\n", - " 25\n", - " 48.0\n", - " \n", - " \n", - " 9253\n", - " Bubovice Airport\n", - " Bubovice\n", - " Czech Republic\n", - " NaN\n", - " LKBU\n", - " 49.974400\n", - " 14.178100\n", - " 1401\n", - " 1.0\n", - " E\n", - " Europe/Prague\n", - " airport\n", - " OurAirports\n", - " 128\n", - " 0\n", - " 15\n", - " 32.0\n", + " Cameroon\n", + " 28.0\n", + " 51.250000\n", + " 83.0\n", + " 3\n", + " 0.75000\n", + " 61\n", + " 15.250000\n", + " 339\n", + " 84.750000\n", " \n", " \n", "\n", - "

6029 rows × 17 columns

\n", "" ], "text/plain": [ - " airport_name city country IATA \\\n", - "ID \n", - "1600 Bar Yehuda Airfield Metzada Israel MTZ \n", - "1595 Ein Yahav Airfield Eyn-yahav Israel EIY \n", - "7646 Jacqueline Cochran Regional Airport Palm Springs United States TRM \n", - "4357 Atyrau Airport Atyrau Kazakhstan GUW \n", - "2151 Ramsar Airport Ramsar Iran RZR \n", - "... ... ... ... ... \n", - "3039 Lengpui Airport Aizwal India AJL \n", - "1670 Emmen Air Base Emmen Switzerland EML \n", - "6215 Long Lellang Airport Long Datih Malaysia LGL \n", - "7375 Minaçu Airport Minacu Brazil MQH \n", - "9253 Bubovice Airport Bubovice Czech Republic NaN \n", + " dur_min dur_mean dur_max cancelled_sum \\\n", + "continent country \n", + "Africa Algeria 26.0 43.739130 82.0 6 \n", + " Angola 28.0 42.714286 53.0 9 \n", + " Benin 69.0 69.000000 69.0 0 \n", + " Burkina Faso 35.0 35.000000 35.0 0 \n", + " Cameroon 28.0 51.250000 83.0 3 \n", "\n", - " ICAO latitude longitude altitude timezone DST \\\n", - "ID \n", - "1600 LLMZ 31.328199 35.388599 -1266 2.0 E \n", - "1595 LLEY 30.621700 35.203300 -164 2.0 E \n", - "7646 KTRM 33.626701 -116.160004 -115 -8.0 A \n", - "4357 UATG 47.121899 51.821400 -72 5.0 U \n", - "2151 OINR 36.909901 50.679600 -70 3.5 E \n", - "... ... ... ... ... ... .. 
\n", - "3039 VELP 23.840599 92.619698 1398 5.5 N \n", - "1670 LSME 47.092444 8.305184 1400 1.0 E \n", - "6215 WBGF 3.421000 115.153999 1400 8.0 N \n", - "7375 SBMC -13.549100 -48.195301 1401 -3.0 S \n", - "9253 LKBU 49.974400 14.178100 1401 1.0 E \n", + " cancelled_mean delayed_sum delayed_mean \\\n", + "continent country \n", + "Africa Algeria 0.26087 360 15.652174 \n", + " Angola 1.12500 97 12.125000 \n", + " Benin 0.00000 7 7.000000 \n", + " Burkina Faso 0.00000 18 18.000000 \n", + " Cameroon 0.75000 61 15.250000 \n", "\n", - " tz_database_timezone type source flights_planned \\\n", - "ID \n", - "1600 Asia/Jerusalem airport OurAirports 62 \n", - "1595 Asia/Jerusalem airport OurAirports 56 \n", - "7646 America/Los_Angeles airport OurAirports 60 \n", - "4357 Asia/Oral airport OurAirports 71 \n", - "2151 Asia/Tehran airport OurAirports 62 \n", - "... ... ... ... ... \n", - "3039 Asia/Calcutta airport OurAirports 118 \n", - "1670 Europe/Zurich airport OurAirports 124 \n", - "6215 Asia/Kuala_Lumpur airport OurAirports 126 \n", - "7375 America/Sao_Paulo airport OurAirports 119 \n", - "9253 Europe/Prague airport OurAirports 128 \n", - "\n", - " flights_cancelled flights_delayed delay_duration \n", - "ID \n", - "1600 0 9 32.0 \n", - "1595 0 7 24.0 \n", - "7646 0 7 28.0 \n", - "4357 0 9 35.0 \n", - "2151 1 6 47.0 \n", - "... ... ... ... 
\n", - "3039 0 23 38.0 \n", - "1670 0 19 38.0 \n", - "6215 0 18 32.0 \n", - "7375 1 25 48.0 \n", - "9253 0 15 32.0 \n", - "\n", - "[6029 rows x 17 columns]" + " planned_sum planned_mean \n", + "continent country \n", + "Africa Algeria 1864 81.043478 \n", + " Angola 472 59.000000 \n", + " Benin 28 28.000000 \n", + " Burkina Faso 65 65.000000 \n", + " Cameroon 339 84.750000 " ] }, - "execution_count": 59, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_del" + "# 4.2\n", + "df_4_2 = df_del.loc[:, ['country', 'continent', 'delay_duration', 'flights_cancelled', 'flights_delayed', 'flights_planned']] \\\n", + " .sort_values(['continent', 'country']) \\\n", + " .groupby(['continent', 'country']) \\\n", + " .agg(dur_min=('delay_duration', 'min'), \\\n", + " dur_mean=('delay_duration', 'mean'), \\\n", + " dur_max=('delay_duration', 'max'), \\\n", + " cancelled_sum=('flights_cancelled', 'sum'), \\\n", + " cancelled_mean=('flights_cancelled', 'mean'), \\\n", + " delayed_sum=('flights_delayed', 'sum'), \\\n", + " delayed_mean=('flights_delayed', 'mean'), \\\n", + " planned_sum=('flights_planned', 'sum'), \\\n", + " planned_mean=('flights_planned', 'mean'))\n", + " \n", + "df_4_2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "a29b8c2f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_4_3 = df_del.loc[:, ['continent', 'flights_planned', 'flights_delayed']] \\\n", + " .rename(columns={'flights_planned': 'planned', 'flights_delayed': 'delayed'}) \\\n", + " .melt(id_vars=['continent'], value_vars=['planned', 'delayed'], var_name=\"Kind of flight\", \\\n", + " value_name=\"# of flights\") \\\n", + " .sort_values('continent')\n", + "\n", + "f, ax1 = plt.subplots(figsize=(15, 8))\n", + "\n", + "sns.set_theme(style=\"ticks\", palette=\"pastel\")\n", + "\n", + "# Draw a nested boxplot to show bills by day and time\n", + "sns.boxplot(x=\"# of flights\", y=\"continent\",\n", + " hue=\"Kind of flight\", palette=[\"m\", \"g\"],\n", + " data=df_4_3)\n", + "sns.despine(offset=10, trim=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "04aa4de5", + "metadata": {}, + "source": [ + "I observe that in all continents there is a significant higher number of planned flights than the number of delayed flights. This can be determined by the inter-quartile range positions of both series' boxplots with respect to each other." ] }, { @@ -1621,7 +1498,9 @@ "id": "386875c8", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# TODO: 5" + ] }, { "attachments": {},