
Commit 593ebf0

peanutfun, emanuel-schmid, tovogt, timschmi95, and Schmid Timo authored
Add classes for calibrating average ensemble and ensemble of tragedies (#1048)
* Initial draft for calibration from scipy.optimize
* Draft for impact function calibration
* Add first unit tests of calibration module
* ci: Add bayesian-optimization during Jenkins build
* Add __init__.py for util/calibrate/test module
* Add climada.util.calibrate.test module to test discovery
* Add unit and integration tests, update code base
* Start documenting new calibrate module
* Actually add the integration test
* Add some documentation
* commit PLEASE CLEAN UP
* Add more docstrings and simplify imports through __init__
* Add separate Output classes for each optimizer
* Restructure calibration module
* Add tutorial on impact function calibration
* Update tutorial
* Remove hazard event selection from calibrate.Input
* Update calibration tutorial
* Update climada/util/calibrate/bayesian_optimizer.py: use negative cost function as target function in BayesianOptimizer (Co-authored-by: Thomas Vogt <[email protected]>)
* Separate computing cost from transforming impact objects
* Add evaluator for calibration output
* Add TestBayesianOptimizer test to test loader
* Update code, docs, and tutorial
* Update tutorial
* Add option to adjust data frame alignment
* add seaborn
* Add function to plot Impf variability of calibration (#791) (Co-authored-by: Schmid Timo <[email protected]>, Lukas Riedel <[email protected]>)
* Improve alignment and handling of NaNs
* Add seaborn to dependencies
* Split tests into multiple files, finish up
* Move impact transform and align to Input
* Use MultiIndex in parameter space dataframe
* Update tutorial
* Fix requirements for calibration module: seaborn and bayes_opt
* Remove plot_impf_set function and improve exception type
* Add tests for OutputEvaluator
* Fix name of bayes_opt package on PyPI
* Make sure latest seaborn is installed on Jenkins
* Remove unused function definition
* Fix linter issues and remove unused code
* Add BayesianOptimizerOutputEvaluator
* Fix typo in tutorial
* Add GNU license header to new files
* Update CHANGELOG.md
* edit authors.md
* Fix a bug in parameter space plot
* Add ensemble calibrators
* Fix issue with constraints and allow_duplicate_points
* Fix bug in to_input_var
* Increment random seed and update output handling
* Make EnsembleOptimizer run in parallel
* Add docstring to EnsembleOptimizer.run
* Add suggestions from code review
* Rename 'true' parameter to 'data' in 'target_func' method.
* Remove setting random_state in run function during tests.
* Add explicit haz_type to tests to avoid warnings.
* Update cross-calibration module
* Add option to read and write EnsembleOptimizerOutput from/to HDF5.
* Add plotting functions for EnsembleOptimizerOutput.
* Streamline EnsembleOptimizer with generator methods.
* Add unit tests for cross calibration (many tests still missing)
* Avoid errors when storing event names in cross calibration output
* Add iteration controller for BayesianOptimizer
* Fix calibrate module init and add first controller tests
* Deepcopy optimizer run kwargs and add hazard intensity histogram to shiny plot
* Fix handling of instance maximum for constrained optimization
* Add tests for BayesianOptimizerController and fix verbosity
* Add test for plotting parameter space
* Update integration tests for BayesianOptimizer: include controller
* Add explanation of BayesianOptimizerController to tutorial
* add preliminary plot_category function
* Add option to reduce tragedy ensemble size
* Add option to store and load calibration results
* Update plots and tutorial
* Fix annotations, docstrings, some linter issues
* Add tests for cross_calibrate.py
* Add test execution to test_cross_calibrate.py
* Update plots, tests, docstrings, docs
* Rename cross_calibrate.py to ensemble.py
* Improve plots, add 'replace' parameter to AverageEnsembleOptimizer
* Add ensemble calibration to tutorial
* Remove custom changes to Jenkins script
* Beautify plot
* Update calibration tutorial
* Use absolute imports for referring out of this module
* Update docs and links in tutorial
* Add `data_weights` to calibration Input
* Use weights for sampling with replacement in AverageEnsembleOptimizer.
* Update tests
* Add equality comparison to EnsembleOptimizerOutput
* Update CHANGELOG.md
* Update CHANGELOG.md
* Comply to abstractmethod interface in BayesianOptimizer
* Fix linter issue where builtin 'input' was shadowed
* Suggest that cost functions take ndarrays
* Update tutorial
* Make cost functions consume numpy arrays
* Update docs
* Update tutorial
* remove useless tqdm artefacts
* Update climada/util/calibrate/ensemble.py (Co-authored-by: Emanuel Schmid <[email protected]>)

---------

Co-authored-by: emanuel-schmid <[email protected]>
Co-authored-by: Thomas Vogt <[email protected]>
Co-authored-by: Timo Schmid <[email protected]>
Co-authored-by: Schmid Timo <[email protected]>
Co-authored-by: Emanuel Schmid <[email protected]>
1 parent a9f9e6a commit 593ebf0

File tree

13 files changed: +2710 -453 lines

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,7 @@ Removed:
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)
 - Added util methods to handle crs coordinates consistently: `is_geo_coords`, `check_if_geo_coords`, `get_crs_unit`, `estimate_matching_threshold`, `degree_to_km`, and `km_to_degree` [#1080](https://github.com/CLIMADA-project/climada_python/pull/1080)
 - `ImpactFunc` and `ImpactFuncSet` now support equality comparisons via `==` [#1027](https://github.com/CLIMADA-project/climada_python/pull/1027)
+- Calibration of impact function ensembles in `climada.util.calibrate` [#1048](https://github.com/CLIMADA-project/climada_python/pull/1048)
 - Added optional `attrs` parameter to `Exposures.from_raster` method to set additional object properties through the method's `Exposures.__init__` call.

 ### Changed
@@ -45,6 +46,7 @@ geographic coordinates as input (e.g. `util.coordinates.dist_to_coast`, `util.co
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
 - `Exposures.write_hdf5` pickles geometry data in WKB format, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)
 - The online documentation has been completely overhauled, now uses PyData theme: [#977](https://github.com/CLIMADA-project/climada_python/pull/977)
+- `Input` to impact function calibration tasks now supports adding weights to the data [#1048](https://github.com/CLIMADA-project/climada_python/pull/1048)
 - Add `climada.hazard.xarray` module with helper structures for reading Hazard objects from `xarray` data [#1063](https://github.com/CLIMADA-project/climada_python/pull/1063)
 - The output of the `impact_yearset` was changed to only contain attributes corresponding to the yearly impact set. The application of the correction factor and the frequency of the resulting yearly impact object are corrected. [#1075](https://github.com/CLIMADA-project/climada_python/pull/1075)
 - `util.coordinates.get_resolution` always returns positive values, regardless of how the input coordinates' order [#1080](https://github.com/CLIMADA-project/climada_python/pull/1080).

climada/test/test_util_calibrate.py

Lines changed: 3 additions & 3 deletions
@@ -176,7 +176,7 @@ def test_single(self):
             init_points=10, n_iter=20, max_iterations=1
         )
         optimizer = BayesianOptimizer(self.input, random_state=1)
-        output = optimizer.run(controller)
+        output = optimizer.run(controller=controller)

         # Check result (low accuracy)
         self.assertAlmostEqual(output.params["slope"], 1.0, places=2)
@@ -210,7 +210,7 @@ def test_multiple_constrained(self):
         controller = BayesianOptimizerController.from_input(
             self.input, sampling_base=5, max_iterations=3
         )
-        output = optimizer.run(controller)
+        output = optimizer.run(controller=controller)

         # Check results (low accuracy)
         self.assertEqual(output.p_space.dim, 2)
@@ -246,7 +246,7 @@ def test_plots(self):
         controller = BayesianOptimizerController.from_input(
             self.input, max_iterations=1
         )
-        output = optimizer.run(controller)
+        output = optimizer.run(controller=controller)

         output_eval = OutputEvaluator(self.input, output)
         output_eval.impf_set.plot()

climada/util/calibrate/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -26,4 +26,10 @@
     BayesianOptimizerOutputEvaluator,
     select_best,
 )
+from .cost_func import mse, msle
+from .ensemble import (
+    AverageEnsembleOptimizer,
+    EnsembleOptimizerOutput,
+    TragedyEnsembleOptimizer,
+)
 from .scipy_optimizer import ScipyMinimizeOptimizer
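
For orientation, the `__init__.py` additions above re-export the new cost functions and ensemble optimizers from the package root; a minimal sketch, assuming a CLIMADA installation that includes this commit:

```python
# Names newly importable from climada.util.calibrate after this commit.
from climada.util.calibrate import (
    AverageEnsembleOptimizer,
    EnsembleOptimizerOutput,
    TragedyEnsembleOptimizer,
    mse,
    msle,
)
```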

climada/util/calibrate/base.py

Lines changed: 64 additions & 13 deletions
@@ -47,9 +47,9 @@ class Input:
         Hazard object to compute impacts from
     exposure : climada.Exposures
         Exposures object to compute impacts from
-    data : pandas.Dataframe
+    data : pandas.DataFrame
         The data to compare computed impacts to. Index: Event IDs matching the IDs of
-        ``hazard``. Columns: Arbitrary columns. NaN values in the data frame have
+        :py:attr:`hazard`. Columns: Arbitrary columns. NaN values in the data frame have
         special meaning: Corresponding impact values computed by the model are ignored
         in the calibration.
     impact_func_creator : Callable
@@ -64,8 +64,11 @@ class Input:
     cost_func : Callable
         Function that takes two ``pandas.Dataframe`` objects and returns the scalar
         "cost" between them. The optimization algorithm will try to minimize this
-        number. The first argument is the true/correct values (:py:attr:`data`), and the
-        second argument is the estimated/predicted values.
+        number. The first argument is the true/correct values (:py:attr:`data`), the
+        second argument is the estimated/predicted values, and the third argument is the
+        :py:attr:`data_weights`. The cost function is intended to operate on
+        ``numpy.ndarray`` objects.
+        Dataframes are transformed using :py:attr:`df_to_numpy`.
     bounds : Mapping (str, {Bounds, tuple(float, float)}), optional
         The bounds for the parameters. Keys: parameter names. Values:
         ``scipy.minimize.Bounds`` instance or tuple of minimum and maximum value.
@@ -85,6 +88,16 @@ class Input:
         :py:attr:`data`, insert this value. Defaults to NaN, in which case the impact
         from the model is ignored. Set this to zero to explicitly calibrate to zero
         impacts in these cases.
+    df_to_numpy : Callable, optional
+        A function that transforms a pandas.DataFrame into a numpy.ndarray to be
+        inserted into the :py:attr:`cost_func`. By default, this will flatten the data
+        frame.
+    data_weights : pandas.DataFrame, optional
+        Weights for each entry in :py:attr:`data`. Must have the exact same index and
+        columns. If ``None``, the weights will be ignored (equivalent to the same weight
+        for each event).
+    missing_weights_value : float, optional
+        Same as :py:attr:`missing_data_value`, but for :py:attr:`data_weights`.
     assign_centroids : bool, optional
         If ``True`` (default), assign the hazard centroids to the exposure when this
         object is created.
@@ -95,14 +108,19 @@ class Input:
     data: pd.DataFrame
     impact_func_creator: Callable[..., ImpactFuncSet]
     impact_to_dataframe: Callable[[Impact], pd.DataFrame]
-    cost_func: Callable[[pd.DataFrame, pd.DataFrame], Number]
+    cost_func: Callable[[np.ndarray, np.ndarray, np.ndarray | None], Number]
     bounds: Optional[Mapping[str, Union[Bounds, Tuple[Number, Number]]]] = None
     constraints: Optional[Union[ConstraintType, list[ConstraintType]]] = None
     impact_calc_kwds: Mapping[str, Any] = field(
         default_factory=lambda: {"assign_centroids": False}
     )
     missing_data_value: float = np.nan
-    assign_centroids: InitVar[bool] = True
+    df_to_numpy: Callable[[pd.DataFrame], np.ndarray] = (
+        lambda df: df.to_numpy().flatten()
+    )
+    data_weights: pd.DataFrame | None = field(default=None, kw_only=True)
+    missing_weights_value: float = field(default=0.0, kw_only=True)
+    assign_centroids: InitVar[bool] = field(default=True, kw_only=True)

     def __post_init__(self, assign_centroids):
         """Prepare input data"""
@@ -115,6 +133,17 @@ def __post_init__(self, assign_centroids):
             )
             raise TypeError("'data' must be a pandas.DataFrame")

+        if self.data_weights is not None:
+            try:
+                pd.testing.assert_index_equal(self.data.index, self.data_weights.index)
+                pd.testing.assert_index_equal(
+                    self.data.columns, self.data_weights.columns
+                )
+            except AssertionError as err:
+                raise ValueError(
+                    "'data_weights' must have exact same index and columns as 'data'"
+                ) from err
+
         if assign_centroids:
             self.exposure.assign_centroids(self.hazard)

@@ -413,26 +442,30 @@ class Optimizer(ABC):

     input: Input

-    def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
+    def _target_func(
+        self, data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None
+    ) -> Number:
         """Target function for the optimizer

         The default version of this function simply returns the value of the cost
         function evaluated on the arguments.

         Parameters
         ----------
-        data : pandas.DataFrame
+        data : nd.ndarray
             The reference data used for calibration. By default, this is
             :py:attr:`Input.data`.
-        predicted : pandas.DataFrame
+        predicted : nd.ndarray
             The impact predicted by the data calibration after it has been transformed
             into a dataframe by :py:attr:`Input.impact_to_dataframe`.
+        weights : nd.ndarray
+            The relative weight for each data/entry pair.

         Returns
         -------
         The value of the target function for the optimizer.
         """
-        return self.input.cost_func(data, predicted)
+        return self.input.cost_func(data, predicted, weights)

     def _kwargs_to_impact_func_creator(self, *_, **kwargs) -> Dict[str, Any]:
         """Define how the parameters to :py:meth:`_opt_func` must be transformed
@@ -484,11 +517,29 @@ def _opt_func(self, *args, **kwargs) -> Number:
             hazard=self.input.hazard,
         ).impact(**self.input.impact_calc_kwds)

-        # Transform to DataFrame, align, and compute target function
+        # Transform to DataFrame and align
         data_aligned, impact_df_aligned = self.input.impact_to_aligned_df(
-            impact, fillna=0
+            impact, fillna=0.0
+        )
+
+        # Align weights
+        weights_aligned = None
+        if self.input.data_weights is not None:
+            weights_aligned, _ = self.input.data_weights.align(
+                data_aligned,
+                axis=None,
+                join="right",
+                copy=True,
+                fill_value=self.input.missing_weights_value,
+            )
+            weights_aligned = self.input.df_to_numpy(weights_aligned)
+
+        # Compute target function
+        return self._target_func(
+            self.input.df_to_numpy(data_aligned),
+            self.input.df_to_numpy(impact_df_aligned),
+            weights_aligned,
         )
-        return self._target_func(data_aligned, impact_df_aligned)

     @abstractmethod
     def run(self, **opt_kwargs) -> Output:
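
The `Input` changes above establish a weighted, array-based cost-function contract: `cost_func` now receives three `numpy.ndarray` arguments (data, predicted, weights or `None`), data frames are converted with `df_to_numpy` (flattening by default), and `data_weights` must share index and columns with `data`. Below is a hedged sketch of how such an `Input` might be set up; `my_hazard`, `my_exposure`, `impf_from_params`, and `impact_to_df` are hypothetical placeholders, not part of this commit, and `Input` is assumed to be re-exported from `climada.util.calibrate`.

```python
# Hedged sketch only: `my_hazard`, `my_exposure`, `impf_from_params`, and
# `impact_to_df` are hypothetical placeholders the user must supply.
import pandas as pd

from climada.util.calibrate import Input, mse  # `Input` assumed to be re-exported

obs = pd.DataFrame({"impact": [1.0e6, 2.5e6]}, index=[1, 2])  # observed impacts per event ID
weights = pd.DataFrame({"impact": [1.0, 2.0]}, index=[1, 2])  # same index/columns as `obs`

calib_input = Input(
    hazard=my_hazard,                      # climada.Hazard with matching event IDs
    exposure=my_exposure,                  # climada.entity.Exposures
    data=obs,
    impact_func_creator=impf_from_params,  # Callable(**params) -> ImpactFuncSet
    impact_to_dataframe=impact_to_df,      # Callable(Impact) -> pd.DataFrame
    cost_func=mse,                         # called as mse(data, predicted, weights)
    data_weights=weights,                  # keyword-only; mismatched index raises ValueError
)
```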

climada/util/calibrate/bayesian_optimizer.py

Lines changed: 18 additions & 6 deletions
@@ -616,11 +616,13 @@ def __post_init__(self, random_state, allow_duplicate_points, bayes_opt_kwds):
             **bayes_opt_kwds,
         )

-    def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
+    def _target_func(
+        self, data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None
+    ) -> Number:
         """Invert the cost function because BayesianOptimization maximizes the target"""
-        return -self.input.cost_func(data, predicted)
+        return -self.input.cost_func(data, predicted, weights)

-    def run(self, controller: BayesianOptimizerController) -> BayesianOptimizerOutput:
+    def run(self, **opt_kwargs) -> BayesianOptimizerOutput:
         """Execute the optimization

         ``BayesianOptimization`` *maximizes* a target function. Therefore, this class
@@ -631,15 +633,25 @@ def run(self, controller: BayesianOptimizerController) -> BayesianOptimizerOutpu
         ----------
         controller : BayesianOptimizerController
             The controller instance used to set the optimization iteration parameters.
-        opt_kwargs
-            Further keyword arguments passed to ``BayesianOptimization.maximize``.
+        kwargs
+            Further keyword arguments passed to ``BayesianOptimization.maximize``. Note
+            that some arguments are also provided by
+            :py:meth:`BayesianOptimizerController.optimizer_params`.

         Returns
         -------
         output : BayesianOptimizerOutput
             Optimization output. :py:attr:`BayesianOptimizerOutput.p_space` stores data
             on the sampled parameter space.
         """
+        # Take the controller
+        try:
+            controller = opt_kwargs.pop("controller")
+        except KeyError as err:
+            raise RuntimeError(
+                "BayesianOptimizer.run requires 'controller' as keyword argument"
+            ) from err
+
         # Register the controller
         for event in (Events.OPTIMIZATION_STEP, Events.OPTIMIZATION_END):
             self.optimizer.subscribe(event, controller)
@@ -660,7 +672,7 @@ def run(self, controller: BayesianOptimizerController) -> BayesianOptimizerOutpu
         while controller.iterations < controller.max_iterations:
             try:
                 LOGGER.info(f"Optimization iteration: {controller.iterations}")
-                self.optimizer.maximize(**controller.optimizer_params())
+                self.optimizer.maximize(**controller.optimizer_params(), **opt_kwargs)
             except StopEarly:
                 # Start a new iteration
                 continue
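
With this change, `BayesianOptimizer.run` accepts only keyword arguments, pops `controller` from them (raising `RuntimeError` if it is missing), and forwards the remaining keywords to `BayesianOptimization.maximize`. A sketch of the updated call pattern, mirroring the adjusted integration tests; `calib_input` refers to the hypothetical `Input` from the earlier sketch:

```python
# Sketch of the new keyword-only call pattern; `calib_input` is hypothetical.
from climada.util.calibrate import BayesianOptimizer, BayesianOptimizerController

controller = BayesianOptimizerController.from_input(calib_input, max_iterations=1)
optimizer = BayesianOptimizer(calib_input, random_state=1)

output = optimizer.run(controller=controller)  # positional run(controller) no longer works
# calling optimizer.run() without `controller` raises RuntimeError
print(output.params)
```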

climada/util/calibrate/cost_func.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+"""
+This file is part of CLIMADA.
+
+Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
+
+CLIMADA is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free
+Software Foundation, version 3.
+
+CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
+
+---
+Cost functions for impact function calibration module
+"""
+
+import numpy as np
+from sklearn.metrics import mean_squared_error, mean_squared_log_error
+
+
+def mse(data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None) -> float:
+    """Weighted mean squared error
+
+    See
+    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
+    """
+    return mean_squared_error(data, predicted, sample_weight=weights)
+
+
+def msle(data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None) -> float:
+    """Weighted mean squared logarithmic error
+
+    See
+    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html
+    """
+    return mean_squared_log_error(data, predicted, sample_weight=weights)
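
The two new cost functions are thin wrappers around scikit-learn metrics and accept an optional sample-weight array, matching the three-argument cost-function contract used by `Optimizer._target_func`. A small, self-contained usage sketch:

```python
# Runnable sketch of the new cost functions, with and without weights.
import numpy as np

from climada.util.calibrate import mse, msle

data = np.array([1.0, 2.0, 4.0])
predicted = np.array([1.5, 2.5, 3.0])
weights = np.array([1.0, 1.0, 2.0])

print(mse(data, predicted, None))      # unweighted mean squared error
print(mse(data, predicted, weights))   # weighted: heavier entries contribute more
print(msle(data, predicted, weights))  # weighted mean squared log error (non-negative inputs only)
```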
