Update calibration tutorial

peanutfun · peanutfun · commit e1fe68aa3820 · 2023-06-13T18:41:28.000+02:00
diff --git a/doc/tutorial/climada_util_calibrate.ipynb b/doc/tutorial/climada_util_calibrate.ipynb
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -46,7 +46,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -867,7 +867,7 @@
        "[27 rows x 22 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -899,7 +899,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -1332,7 +1332,7 @@
        "2017260N12310  0.000000e+00  1.534596e+09  "
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1385,7 +1385,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1412,14 +1412,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2023-06-13 16:36:05,304 - climada.hazard.tc_tracks - WARNING - The cached IBTrACS data set dates from 2022-03-08 23:23:51 (older than 180 days). Very likely, a more recent version is available. Consider manually removing the file /Users/ldr.riedel/climada/data/IBTrACS.ALL.v04r00.nc and re-running this function, which will download the most recent version of the IBTrACS data set from the official URL.\n"
+      "2023-06-13 17:55:56,240 - climada.hazard.tc_tracks - WARNING - The cached IBTrACS data set dates from 2022-03-08 23:23:51 (older than 180 days). Very likely, a more recent version is available. Consider manually removing the file /Users/ldr.riedel/climada/data/IBTrACS.ALL.v04r00.nc and re-running this function, which will download the most recent version of the IBTrACS data set from the official URL.\n"
      ]
     },
     {
@@ -1484,7 +1484,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1521,26 +1521,23 @@
     "\n",
     "Computations on data frames align columns and indexes.\n",
     "The indexes of the calibration data are the IBTrACS IDs, but the indexes of the result of `Impact.impact_at_reg` are the hazard event IDs, which at this point are only integer numbers.\n",
-    "To resolve that, we simply set the hazard event IDs to the IBTrACS IDs, which are stored in `Hazard.event_name`.\n",
+    "To resolve that, we set the index of the resulting impact data to the IBTrACS IDs, which are stored in `Hazard.event_name`.\n",
     "Once both the impact data and the calibration data are in the same data format, we can compute the RMSE:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
     "import numpy as np\n",
     "from climada.engine import Impact\n",
     "\n",
-    "# TODO: Dont\n",
-    "# Make sure that Hazard.event_id matches indexes of 'data'\n",
-    "# hazard.event_id = np.asarray(hazard.event_name)\n",
-    "\n",
     "def cost_rmse(impact: Impact, data: pd.DataFrame):\n",
     "    \"\"\"A cost function computing the RMSE\"\"\"\n",
     "    impact = impact.impact_at_reg(exposure.gdf[\"region_id\"])\n",
+    "    impact.set_index(np.asarray(hazard.event_name), inplace=True)\n",
     "    return np.sqrt(np.mean(((data - impact) ** 2).to_numpy()))"
    ]
   },
@@ -1558,7 +1555,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1582,7 +1579,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -1609,7 +1606,7 @@
        "{'scale': 0.9903198881207879, 'v_half': 61.51163348395183}"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1625,9 +1622,7 @@
     "    cost_func=cost_rmse,\n",
     "    impact_func_gen=impact_func_tc,\n",
     "    bounds=bounds,\n",
-    "    align=False,\n",
     ")\n",
-    "exposure.assign_centroids(hazard)\n",
     "\n",
     "# Create and run the optimizer\n",
     "opt = BayesianOptimizer(input)\n",
@@ -1648,7 +1643,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -1657,7 +1652,7 @@
        "<AxesSubplot:title={'center':'TC 1: Emanuel 2011'}, xlabel='Intensity (m/s)', ylabel='Impact (%)'>"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -1690,7 +1685,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -1815,7 +1810,7 @@
        "[200 rows x 3 columns]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1835,7 +1830,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -1960,7 +1955,7 @@
        "[200 rows x 3 columns]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1972,16 +1967,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[<matplotlib.lines.Line2D at 0x329bee910>]"
+       "[<matplotlib.lines.Line2D at 0x30dc62cd0>]"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -2019,7 +2014,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -2354,8 +2349,8 @@
        "</div>"
       ],
       "text/plain": [
-       "                        28            44            92             132   \n",
-       "2010176N16278  0.000000e+00  0.000000e+00  0.000000e+00       0.000000  \\\n",
+       "                        28            44            92             132  \\\n",
+       "2010176N16278  0.000000e+00  0.000000e+00  0.000000e+00       0.000000   \n",
        "2010236N12341  9.610111e+06  0.000000e+00  2.390220e+07       0.000000   \n",
        "2010257N16282  0.000000e+00  0.000000e+00  0.000000e+00       0.000000   \n",
        "2010302N09306  0.000000e+00  0.000000e+00  0.000000e+00       0.000000   \n",
@@ -2373,8 +2368,8 @@
        "2017242N16333  1.816087e+08  8.362996e+05  3.338384e+08       0.000000   \n",
        "2017260N12310  0.000000e+00  0.000000e+00  1.910071e+07       0.000000   \n",
        "\n",
-       "                        192           212           214           388   \n",
-       "2010176N16278  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  \\\n",
+       "                        192           212           214           388  \\\n",
+       "2010176N16278  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   \n",
        "2010236N12341  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   \n",
        "2010257N16282  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   \n",
        "2010302N09306  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   \n",
@@ -2392,8 +2387,8 @@
        "2017242N16333  1.573645e+09  0.000000e+00  1.005487e+07  0.000000e+00   \n",
        "2017260N12310  0.000000e+00  4.919065e+08  4.214500e+07  0.000000e+00   \n",
        "\n",
-       "                        484           630           659           662   \n",
-       "2010176N16278  7.221970e+08  0.000000e+00  0.000000e+00  0.000000e+00  \\\n",
+       "                        484           630           659           662  \\\n",
+       "2010176N16278  7.221970e+08  0.000000e+00  0.000000e+00  0.000000e+00   \n",
        "2010236N12341  0.000000e+00  2.872097e+07  4.721180e+06  0.000000e+00   \n",
        "2010257N16282  5.025816e+07  0.000000e+00  0.000000e+00  0.000000e+00   \n",
        "2010302N09306  0.000000e+00  0.000000e+00  0.000000e+00  3.233805e+06   \n",
@@ -2431,7 +2426,7 @@
        "2017260N12310  0.000000e+00  2.477373e+07  "
       ]
      },
-     "execution_count": 19,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2442,6 +2437,7 @@
     "impf = impact_func_tc(**bayesian_output.params)\n",
     "impact = ImpactCalc(exposure, impf, hazard).impact(assign_centroids=False)\n",
     "impact_data = impact.impact_at_reg(exposure.gdf[\"region_id\"])\n",
+    "impact_data.set_index(np.asarray(hazard.event_name), inplace=True)\n",
     "impact_data"
    ]
   },
@@ -2459,7 +2455,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -2468,7 +2464,7 @@
        "<AxesSubplot:ylabel='Damages (USD)'>"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -2538,7 +2534,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -2873,8 +2869,8 @@
        "</div>"
       ],
       "text/plain": [
-       "               Antigua and Barbuda   Bahamas  Virgin Islands, British   \n",
-       "2010176N16278                  NaN       NaN                      NaN  \\\n",
+       "               Antigua and Barbuda   Bahamas  Virgin Islands, British  \\\n",
+       "2010176N16278                  NaN       NaN                      NaN   \n",
        "2010236N12341            -0.161719       NaN                 7.378438   \n",
        "2010257N16282                  NaN       NaN                      NaN   \n",
        "2010302N09306                  NaN       NaN                      NaN   \n",
@@ -2892,8 +2888,8 @@
        "2017242N16333            -0.065509 -0.333363                -0.953585   \n",
        "2017260N12310                  NaN       NaN                 7.281049   \n",
        "\n",
-       "               Cabo Verde      Cuba  Dominica  Dominican Republic   Jamaica   \n",
-       "2010176N16278         NaN       NaN       NaN                 NaN       NaN  \\\n",
+       "               Cabo Verde      Cuba  Dominica  Dominican Republic   Jamaica  \\\n",
+       "2010176N16278         NaN       NaN       NaN                 NaN       NaN   \n",
        "2010236N12341         NaN       NaN       NaN                 NaN       NaN   \n",
        "2010257N16282         NaN       NaN       NaN                 NaN       NaN   \n",
        "2010302N09306         NaN       NaN       NaN                 NaN       NaN   \n",
@@ -2911,8 +2907,8 @@
        "2017242N16333         NaN -0.844200       NaN            7.002377       NaN   \n",
        "2017260N12310         NaN       NaN -0.494112           -0.114143       NaN   \n",
        "\n",
-       "                 Mexico  Puerto Rico  Saint Kitts and Nevis  Saint Lucia   \n",
-       "2010176N16278 -0.536752          NaN                    NaN          NaN  \\\n",
+       "                 Mexico  Puerto Rico  Saint Kitts and Nevis  Saint Lucia  \\\n",
+       "2010176N16278 -0.536752          NaN                    NaN          NaN   \n",
        "2010236N12341       NaN     7.458199               6.674051          NaN   \n",
        "2010257N16282 -1.984236          NaN                    NaN          NaN   \n",
        "2010302N09306       NaN          NaN                    NaN     0.770404   \n",
@@ -2950,7 +2946,7 @@
        "2017260N12310                               NaN                  7.393991  "
       ]
      },
-     "execution_count": 42,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -3001,31 +2997,12 @@
     "Using a cost function based on the ratio between modelled and observed impact might increase the overall error but decrease the log-error for many events.\n",
     "\n",
     "So we present some ideas on how to continue and/or improve the calibration:\n",
-    "1. Use a different cost function\n",
-    "2. Also calibrate the `v_thresh` parameter. This requires adding constraints, because `v_thresh` < `v_half`.\n",
-    "3. Calibrate different impact functions for houses in Mexico and Puerto Rico within the same optimization task.\n",
-    "4. Employ the `ScipyMinimizeOptimizer` instead of the `BayesianOptimizer`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 70,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "from climada.engine import Impact\n",
-    "\n",
-    "# Define a cost function\n",
-    "def cost_rmsle(impact: Impact, data: pd.DataFrame):\n",
-    "    impact = impact.impact_at_reg(exposure.gdf[\"region_id\"])\n",
-    "    data, impact = data.align(impact, \"outer\", fill_value=0)\n",
-    "    data, impact = data.to_numpy(), impact.to_numpy()\n",
-    "    return np.exp(np.sqrt(np.mean((np.log(data + 1) - np.log(impact + 1)) ** 2)) - 1)\n",
     "\n",
-    "def cost_rmse(impact: Impact, data: pd.DataFrame):\n",
-    "    impact = impact.impact_at_reg(exposure.gdf[\"region_id\"])\n",
-    "    return np.sqrt(np.mean(((data - impact) ** 2).to_numpy()))\n"
+    "1. Run the calibration again, but change the number of initial steps and/or iteration steps.\n",
+    "2. Use a different cost function, e.g., an error measure based on a ratio rather than a difference.\n",
+    "3. Also calibrate the `v_thresh` parameter. This requires adding constraints, because `v_thresh` must always be smaller than `v_half`.\n",
+    "4. Calibrate different impact functions for houses in Mexico and Puerto Rico within the same optimization task.\n",
+    "5. Employ the `ScipyMinimizeOptimizer` instead of the `BayesianOptimizer`."
    ]
   }
  ],