hw1: ex1, ex2, ex3 (code only), ex4.1, ex4.2 (no continents) done

2023-03-22 10:35:29 +01:00 · 2023-03-22 10:35:29 +01:00 · a7d3b2fce0
commit a7d3b2fce0
parent 7485e14887
1 changed files with 251 additions and 84 deletions
--- a/Assignment1/Assignment1.ipynb
+++ b/Assignment1/Assignment1.ipynb
@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 1,
   "id": "fcf3beb9",
   "metadata": {},
   "outputs": [],
@ -52,7 +52,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "id": "a0af6847",
   "metadata": {},
   "outputs": [
@ -62,7 +62,7 @@
       "('Ü', 'sloppy-windows-1252')"
      ]
     },
-     "execution_count": 3,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -74,7 +74,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "id": "22ce9426",
   "metadata": {},
   "outputs": [
@ -271,7 +271,7 @@
       "4  2016-03-31 00:00:00             0       60437  2016-04-06 10:17:21  "
      ]
     },
-     "execution_count": 4,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -284,7 +284,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "id": "a332b6a5",
   "metadata": {},
   "outputs": [
@ -313,7 +313,7 @@
       " 'lastSeen': ['str']}"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -331,7 +331,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "id": "11bfa9a2",
   "metadata": {},
   "outputs": [
@ -372,7 +372,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "id": "f1c539c4",
   "metadata": {},
   "outputs": [
@ -413,7 +413,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "id": "86074e70",
   "metadata": {},
   "outputs": [
@ -610,7 +610,7 @@
       "4  2016-03-31 00:00:00             0       60437  2016-04-06 10:17:21  "
      ]
     },
-     "execution_count": 8,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -653,7 +653,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "id": "8b6f9ce3",
   "metadata": {},
   "outputs": [
@ -683,7 +683,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "id": "98f8d101",
   "metadata": {},
   "outputs": [
@ -708,7 +708,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
   "id": "f300f49d",
   "metadata": {},
   "outputs": [
@ -727,7 +727,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
   "id": "923c5354",
   "metadata": {},
   "outputs": [],
@ -738,7 +738,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
   "id": "4b847b1f",
   "metadata": {},
   "outputs": [],
@ -749,7 +749,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
   "id": "bf1f417d",
   "metadata": {},
   "outputs": [
@ -779,7 +779,7 @@
       "dtype: int64"
      ]
     },
-     "execution_count": 14,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -791,7 +791,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
   "id": "919e692f",
   "metadata": {},
   "outputs": [],
@ -836,7 +836,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
   "id": "7cc5c90f",
   "metadata": {},
   "outputs": [
@ -898,7 +898,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 16,
   "id": "ca97e7c8",
   "metadata": {},
   "outputs": [
@ -964,7 +964,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 17,
   "id": "eb956ed4",
   "metadata": {},
   "outputs": [],
@ -985,7 +985,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 18,
   "id": "4a29684b",
   "metadata": {},
   "outputs": [],
@ -995,7 +995,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 19,
   "id": "d3d58d25",
   "metadata": {},
   "outputs": [
@ -1044,7 +1044,10 @@
    "\n",
    "You'll need to work with the *'airports'* and *‘airports-delays’* datasets. Examine the datasets and perform cleansing if needed, before performing the exercise.\n",
    "\n",
-    "1. Create a dataframe that provides, for each country, the mean of flights delayed. Display these information by binning the flights delayed in 6 bins. The resulting dataframe should have the countries as rows and the 6 bins as columns. For this exercise you cannot use pivot_table but only groupby. \n",
+    "1. Create a dataframe that provides, for each country, <del>the mean of flights delayed</del>. Display this information by binning the flights delayed in 6 bins. The resulting dataframe should have the countries as rows and the 6 bins as columns. For this exercise you cannot use pivot_table but only groupby. \n",
+    "\n",
+    "<span style=\"color: red\">According to the professor's answer to a question about this exercise:</span>\n",
+    "> Bin by delay_duration value, compute delay mean per-bin per-country \n",
    "\n",
    "2. Create a dataframe from ‘a*irports-delays’* which shows for each continent and country:\n",
    "    1. max, min and mean of ‘**delay_duration**’;\n",
@ -1057,9 +1060,180 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 53,
   "id": "b4fde7e4",
   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_air = pd.read_csv(\"./datasets/airports.csv\", index_col='ID', na_values=['\\\\N'])\n",
+    "df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\", na_values=['\\\\N'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "f8906707",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>delay_duration_bin</th>\n",
+       "      <th>(15.999, 30.0]</th>\n",
+       "      <th>(30.0, 35.0]</th>\n",
+       "      <th>(35.0, 41.0]</th>\n",
+       "      <th>(41.0, 47.0]</th>\n",
+       "      <th>(47.0, 59.0]</th>\n",
+       "      <th>(59.0, 850.0]</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>country</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Afghanistan</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>44.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>60.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Albania</th>\n",
+       "      <td>18.5</td>\n",
+       "      <td>31.000000</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>56.000000</td>\n",
+       "      <td>63.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Algeria</th>\n",
+       "      <td>26.5</td>\n",
+       "      <td>33.857143</td>\n",
+       "      <td>38.75</td>\n",
+       "      <td>43.0</td>\n",
+       "      <td>51.200000</td>\n",
+       "      <td>73.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>American Samoa</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>43.0</td>\n",
+       "      <td>48.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Angola</th>\n",
+       "      <td>28.0</td>\n",
+       "      <td>34.500000</td>\n",
+       "      <td>36.00</td>\n",
+       "      <td>45.0</td>\n",
+       "      <td>51.666667</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "delay_duration_bin  (15.999, 30.0]  (30.0, 35.0]  (35.0, 41.0]  (41.0, 47.0]  \\\n",
+       "country                                                                        \n",
+       "Afghanistan                    0.0      0.000000          0.00          44.0   \n",
+       "Albania                       18.5     31.000000          0.00           0.0   \n",
+       "Algeria                       26.5     33.857143         38.75          43.0   \n",
+       "American Samoa                 0.0      0.000000          0.00          43.0   \n",
+       "Angola                        28.0     34.500000         36.00          45.0   \n",
+       "\n",
+       "delay_duration_bin  (47.0, 59.0]  (59.0, 850.0]  \n",
+       "country                                          \n",
+       "Afghanistan             0.000000           60.0  \n",
+       "Albania                56.000000           63.0  \n",
+       "Algeria                51.200000           73.0  \n",
+       "American Samoa         48.000000            0.0  \n",
+       "Angola                 51.666667            0.0  "
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_4_1 = df_del.copy()\n",
+    "\n",
+    "# The following statement bins the data by the value of delay_duration.\n",
+    "# The bins are chosen as equally-spaced percentile values of the data. This is done to \n",
+    "# better distribute the data between bins, as it is quite skewed towards low values\n",
+    "df_4_1[\"delay_duration_bin\"] = pd.qcut(df_del.delay_duration, 6)\n",
+    "\n",
+    "# The dataframe will contain countries as row indices, the 6 bins as columns and values\n",
+    "# corresponding to the mean delay_duration per country, per bin. When no delay_duration \n",
+    "# falls in a particular bin for some country, that bin has a value of 0\n",
+    "df_4_1 = df_4_1.loc[:, ['country', 'delay_duration', 'delay_duration_bin']] \\\n",
+    "    .groupby(['country', 'delay_duration_bin']) \\\n",
+    "    .mean() \\\n",
+    "    .fillna(0) \\\n",
+    "    .reset_index() \\\n",
+    "    .pivot(index='country', columns='delay_duration_bin', values='delay_duration') \n",
+    "\n",
+    "df_4_1.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "a677ce07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4.2\n",
+    "# TODO: continents\n",
+    "df_4_2 = df_del.loc[:, ['country', 'delay_duration', 'flights_cancelled', 'flights_delayed', 'flights_planned']] \\\n",
+    "    .groupby('country') \\\n",
+    "    .agg(dur_min=('delay_duration', 'min'), \\\n",
+    "        dur_mean=('delay_duration', 'mean'), \\\n",
+    "        dur_max=('delay_duration', 'max'), \\\n",
+    "        cancelled_sum=('flights_cancelled', 'sum'), \\\n",
+    "        cancelled_mean=('flights_cancelled', 'mean'), \\\n",
+    "        delayed_sum=('flights_delayed', 'sum'), \\\n",
+    "        delayed_mean=('flights_delayed', 'mean'), \\\n",
+    "        planned_sum=('flights_planned', 'sum'), \\\n",
+    "        planned_mean=('flights_planned', 'mean'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "a29b8c2f",
+   "metadata": {},
   "outputs": [
    {
     "data": {
@ -1132,7 +1306,7 @@
       "      <td>31.328199</td>\n",
       "      <td>35.388599</td>\n",
       "      <td>-1266</td>\n",
-       "      <td>2</td>\n",
+       "      <td>2.0</td>\n",
       "      <td>E</td>\n",
       "      <td>Asia/Jerusalem</td>\n",
       "      <td>airport</td>\n",
@ -1152,7 +1326,7 @@
       "      <td>30.621700</td>\n",
       "      <td>35.203300</td>\n",
       "      <td>-164</td>\n",
-       "      <td>2</td>\n",
+       "      <td>2.0</td>\n",
       "      <td>E</td>\n",
       "      <td>Asia/Jerusalem</td>\n",
       "      <td>airport</td>\n",
@ -1172,7 +1346,7 @@
       "      <td>33.626701</td>\n",
       "      <td>-116.160004</td>\n",
       "      <td>-115</td>\n",
-       "      <td>-8</td>\n",
+       "      <td>-8.0</td>\n",
       "      <td>A</td>\n",
       "      <td>America/Los_Angeles</td>\n",
       "      <td>airport</td>\n",
@ -1192,7 +1366,7 @@
       "      <td>47.121899</td>\n",
       "      <td>51.821400</td>\n",
       "      <td>-72</td>\n",
-       "      <td>5</td>\n",
+       "      <td>5.0</td>\n",
       "      <td>U</td>\n",
       "      <td>Asia/Oral</td>\n",
       "      <td>airport</td>\n",
@ -1272,7 +1446,7 @@
       "      <td>47.092444</td>\n",
       "      <td>8.305184</td>\n",
       "      <td>1400</td>\n",
-       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
       "      <td>E</td>\n",
       "      <td>Europe/Zurich</td>\n",
       "      <td>airport</td>\n",
@ -1292,7 +1466,7 @@
       "      <td>3.421000</td>\n",
       "      <td>115.153999</td>\n",
       "      <td>1400</td>\n",
-       "      <td>8</td>\n",
+       "      <td>8.0</td>\n",
       "      <td>N</td>\n",
       "      <td>Asia/Kuala_Lumpur</td>\n",
       "      <td>airport</td>\n",
@ -1312,7 +1486,7 @@
       "      <td>-13.549100</td>\n",
       "      <td>-48.195301</td>\n",
       "      <td>1401</td>\n",
-       "      <td>-3</td>\n",
+       "      <td>-3.0</td>\n",
       "      <td>S</td>\n",
       "      <td>America/Sao_Paulo</td>\n",
       "      <td>airport</td>\n",
@ -1327,12 +1501,12 @@
       "      <td>Bubovice Airport</td>\n",
       "      <td>Bubovice</td>\n",
       "      <td>Czech Republic</td>\n",
-       "      <td>\\N</td>\n",
+       "      <td>NaN</td>\n",
       "      <td>LKBU</td>\n",
       "      <td>49.974400</td>\n",
       "      <td>14.178100</td>\n",
       "      <td>1401</td>\n",
-       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
       "      <td>E</td>\n",
       "      <td>Europe/Prague</td>\n",
       "      <td>airport</td>\n",
@ -1360,67 +1534,60 @@
       "1670                       Emmen Air Base         Emmen     Switzerland  EML   \n",
       "6215                 Long Lellang Airport    Long Datih        Malaysia  LGL   \n",
       "7375                       Minaçu Airport        Minacu          Brazil  MQH   \n",
-       "9253                     Bubovice Airport      Bubovice  Czech Republic   \\N   \n",
+       "9253                     Bubovice Airport      Bubovice  Czech Republic  NaN   \n",
       "\n",
-       "      ICAO   latitude   longitude  altitude timezone DST tz_database_timezone  \\\n",
+       "      ICAO   latitude   longitude  altitude  timezone DST  \\\n",
       "ID                                                          \n",
-       "1600  LLMZ  31.328199   35.388599     -1266        2   E       Asia/Jerusalem   \n",
-       "1595  LLEY  30.621700   35.203300      -164        2   E       Asia/Jerusalem   \n",
-       "7646  KTRM  33.626701 -116.160004      -115       -8   A  America/Los_Angeles   \n",
-       "4357  UATG  47.121899   51.821400       -72        5   U            Asia/Oral   \n",
-       "2151  OINR  36.909901   50.679600       -70      3.5   E          Asia/Tehran   \n",
-       "...    ...        ...         ...       ...      ...  ..                  ...   \n",
-       "3039  VELP  23.840599   92.619698      1398      5.5   N        Asia/Calcutta   \n",
-       "1670  LSME  47.092444    8.305184      1400        1   E        Europe/Zurich   \n",
-       "6215  WBGF   3.421000  115.153999      1400        8   N    Asia/Kuala_Lumpur   \n",
-       "7375  SBMC -13.549100  -48.195301      1401       -3   S    America/Sao_Paulo   \n",
-       "9253  LKBU  49.974400   14.178100      1401        1   E        Europe/Prague   \n",
+       "1600  LLMZ  31.328199   35.388599     -1266       2.0   E   \n",
+       "1595  LLEY  30.621700   35.203300      -164       2.0   E   \n",
+       "7646  KTRM  33.626701 -116.160004      -115      -8.0   A   \n",
+       "4357  UATG  47.121899   51.821400       -72       5.0   U   \n",
+       "2151  OINR  36.909901   50.679600       -70       3.5   E   \n",
+       "...    ...        ...         ...       ...       ...  ..   \n",
+       "3039  VELP  23.840599   92.619698      1398       5.5   N   \n",
+       "1670  LSME  47.092444    8.305184      1400       1.0   E   \n",
+       "6215  WBGF   3.421000  115.153999      1400       8.0   N   \n",
+       "7375  SBMC -13.549100  -48.195301      1401      -3.0   S   \n",
+       "9253  LKBU  49.974400   14.178100      1401       1.0   E   \n",
       "\n",
-       "         type       source  flights_planned  flights_cancelled  \\\n",
+       "     tz_database_timezone     type       source  flights_planned  \\\n",
       "ID                                                                 \n",
-       "1600  airport  OurAirports               62                  0   \n",
-       "1595  airport  OurAirports               56                  0   \n",
-       "7646  airport  OurAirports               60                  0   \n",
-       "4357  airport  OurAirports               71                  0   \n",
-       "2151  airport  OurAirports               62                  1   \n",
+       "1600       Asia/Jerusalem  airport  OurAirports               62   \n",
+       "1595       Asia/Jerusalem  airport  OurAirports               56   \n",
+       "7646  America/Los_Angeles  airport  OurAirports               60   \n",
+       "4357            Asia/Oral  airport  OurAirports               71   \n",
+       "2151          Asia/Tehran  airport  OurAirports               62   \n",
       "...                   ...      ...          ...              ...   \n",
-       "3039  airport  OurAirports              118                  0   \n",
-       "1670  airport  OurAirports              124                  0   \n",
-       "6215  airport  OurAirports              126                  0   \n",
-       "7375  airport  OurAirports              119                  1   \n",
-       "9253  airport  OurAirports              128                  0   \n",
+       "3039        Asia/Calcutta  airport  OurAirports              118   \n",
+       "1670        Europe/Zurich  airport  OurAirports              124   \n",
+       "6215    Asia/Kuala_Lumpur  airport  OurAirports              126   \n",
+       "7375    America/Sao_Paulo  airport  OurAirports              119   \n",
+       "9253        Europe/Prague  airport  OurAirports              128   \n",
       "\n",
-       "      flights_delayed  delay_duration  \n",
+       "      flights_cancelled  flights_delayed  delay_duration  \n",
       "ID                                                        \n",
-       "1600                9            32.0  \n",
-       "1595                7            24.0  \n",
-       "7646                7            28.0  \n",
-       "4357                9            35.0  \n",
-       "2151                6            47.0  \n",
-       "...               ...             ...  \n",
-       "3039               23            38.0  \n",
-       "1670               19            38.0  \n",
-       "6215               18            32.0  \n",
-       "7375               25            48.0  \n",
-       "9253               15            32.0  \n",
+       "1600                  0                9            32.0  \n",
+       "1595                  0                7            24.0  \n",
+       "7646                  0                7            28.0  \n",
+       "4357                  0                9            35.0  \n",
+       "2151                  1                6            47.0  \n",
+       "...                 ...              ...             ...  \n",
+       "3039                  0               23            38.0  \n",
+       "1670                  0               19            38.0  \n",
+       "6215                  0               18            32.0  \n",
+       "7375                  1               25            48.0  \n",
+       "9253                  0               15            32.0  \n",
       "\n",
       "[6029 rows x 17 columns]"
      ]
     },
-     "execution_count": 62,
+     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "df_air = pd.read_csv(\"./datasets/airports.csv\", index_col='ID')\n",
-    "df_del = pd.read_csv(\"./datasets/airports-delays.csv\", index_col='ID', sep=\";\")\n",
-    "\n",
-    "df_del\n",
-    "#pd.cut(df_del.flights_delayed, range(0, df_del.flights_delayed.max(), 25))\n",
-    "\n",
-    "#df_bycountry = df_del.loc[:, ['country', 'flights_delayed']].groupby('country').sum().sort_values('flights_delayed', ascending=False)\n",
-    "#df_bycountry\n"
+    "df_del"
   ]
  },
  {