Make cost functions consume numpy arrays

peanutfun · peanutfun · commit 96cf55cdbc04 · 2025-07-02T12:21:03.000+02:00
diff --git a/climada/util/calibrate/__init__.py b/climada/util/calibrate/__init__.py
@@ -26,6 +26,7 @@
     BayesianOptimizerOutputEvaluator,
     select_best,
 )
+from .cost_func import mse, msle
 from .ensemble import (
     AverageEnsembleOptimizer,
     EnsembleOptimizerOutput,
diff --git a/climada/util/calibrate/base.py b/climada/util/calibrate/base.py
@@ -86,6 +86,10 @@ class Input:
         :py:attr:`data`, insert this value. Defaults to NaN, in which case the impact
         from the model is ignored. Set this to zero to explicitly calibrate to zero
         impacts in these cases.
+    df_to_numpy : Callable, optional
+        A function that transforms a pandas.DataFrame into a numpy.ndarray, which is
+        then passed to :py:attr:`cost_func`. By default, the data frame is flattened
+        into a one-dimensional array.
     data_weights : pandas.DataFrame, optional
         Weights for each entry in :py:attr:`data`. Must have the exact same index and
         columns. If ``None``, the weights will be ignored (equivalent to the same weight
@@ -109,6 +113,9 @@ class Input:
         default_factory=lambda: {"assign_centroids": False}
     )
     missing_data_value: float = np.nan
+    df_to_numpy: Callable[[pd.DataFrame], np.ndarray] = (
+        lambda df: df.to_numpy().flatten()
+    )
     data_weights: pd.DataFrame | None = field(default=None, kw_only=True)
     missing_weights_value: float = field(default=0.0, kw_only=True)
     assign_centroids: InitVar[bool] = field(default=True, kw_only=True)
@@ -523,12 +530,12 @@ def _opt_func(self, *args, **kwargs) -> Number:
                 copy=True,
                 fill_value=self.input.missing_weights_value,
             )
-            weights_aligned = weights_aligned.to_numpy().flatten()
+            weights_aligned = self.input.df_to_numpy(weights_aligned)
 
         # Compute target function
         return self._target_func(
-            data_aligned.to_numpy().flatten(),
-            impact_df_aligned.to_numpy().flatten(),
+            self.input.df_to_numpy(data_aligned),
+            self.input.df_to_numpy(impact_df_aligned),
             weights_aligned,
         )
 
diff --git a/climada/util/calibrate/bayesian_optimizer.py b/climada/util/calibrate/bayesian_optimizer.py
@@ -617,7 +617,7 @@ def __post_init__(self, random_state, allow_duplicate_points, bayes_opt_kwds):
         )
 
     def _target_func(
-        self, data: pd.DataFrame, predicted: pd.DataFrame, weights: pd.DataFrame | None
+        self, data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None
     ) -> Number:
         """Invert the cost function because BayesianOptimization maximizes the target"""
         return -self.input.cost_func(data, predicted, weights)
diff --git a/climada/util/calibrate/cost_func.py b/climada/util/calibrate/cost_func.py
@@ -0,0 +1,40 @@
+"""
+This file is part of CLIMADA.
+
+Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
+
+CLIMADA is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free
+Software Foundation, version 3.
+
+CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
+
+---
+Cost functions for impact function calibration module
+"""
+
+import numpy as np
+from sklearn.metrics import mean_squared_error, mean_squared_log_error
+
+
+def mse(data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None) -> float:
+    """Weighted mean squared error
+
+    See
+    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
+    """
+    return mean_squared_error(data, predicted, sample_weight=weights)
+
+
+def msle(data: np.ndarray, predicted: np.ndarray, weights: np.ndarray | None) -> float:
+    """Weighted mean squared logarithmic error
+
+    See
+    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html
+    """
+    return mean_squared_log_error(data, predicted, sample_weight=weights)
diff --git a/climada/util/calibrate/test/test_base.py b/climada/util/calibrate/test/test_base.py
@@ -212,6 +212,7 @@ def test_align(self, _):
         self.input.impact_to_dataframe = lambda _: pd.DataFrame(
             data={"col1": [2, 4], "col2": [4, 0]}, index=[0, 2]
         )
+        self.input.df_to_numpy = lambda df: df
         self.input.cost_func = lambda x, y, w: (x, y, w)
 
         # Apply
diff --git a/climada/util/calibrate/test/test_bayesian_optimizer.py b/climada/util/calibrate/test/test_bayesian_optimizer.py
@@ -316,10 +316,10 @@ def test_kwargs_to_impact_func_creator(self, _):
 
         # Call 'run'
         with patch.object(self.input, "impact_to_aligned_df") as align:
-            align.return_value = (None, None)
+            align.return_value = (pd.DataFrame(), pd.DataFrame())
             self.optimizer.run(controller=self.controller)
 
-        # Check call to '_kwargs_to_impact_func_gen'
+        # Check call to '_kwargs_to_impact_func_creator'
         call_args = self.input.impact_func_creator.call_args_list
         self.assertEqual(len(call_args), 3)
         for args in call_args:
@@ -340,7 +340,7 @@ def test_target_func(self, _):
 
         # Call 'run'
         with patch.object(self.input, "impact_to_aligned_df") as align:
-            align.return_value = (None, None)
+            align.return_value = (pd.DataFrame(), pd.DataFrame())
             output = self.optimizer.run(controller=self.controller)
 
         # Check target space
diff --git a/climada/util/calibrate/test/test_ensemble.py b/climada/util/calibrate/test/test_ensemble.py
@@ -371,16 +371,17 @@ def setUp(self):
 
     def test_post_init_sampling(self):
         opt = AverageEnsembleOptimizer(
-            input=self.input, sample_fraction=0.5, optimizer_type=ConcreteOptimizer
+            input=self.input, optimizer_type=ConcreteOptimizer
         )
         samples = np.array(opt.samples)
-        self.assertTupleEqual(samples.shape, (20, 2, 2))
+        self.assertTupleEqual(samples.shape, (20, 4, 2))
 
         opt = AverageEnsembleOptimizer(
             input=self.input,
             ensemble_size=11,
             sample_fraction=0.8,  # Will cause rounding
             optimizer_type=ConcreteOptimizer,
+            replace=False,
         )
         samples = np.array(opt.samples)
         self.assertTupleEqual(samples.shape, (11, 3, 2))
@@ -390,6 +391,7 @@ def test_post_init_sampling(self):
             ensemble_size=2,
             sample_fraction=0.95,  # Will cause rounding, always select all
             optimizer_type=ConcreteOptimizer,
+            replace=False,
         )
 
         samples = [sorted([tuple(idx) for idx in arr]) for arr in opt.samples]
diff --git a/climada/util/calibrate/test/test_scipy_optimizer.py b/climada/util/calibrate/test/test_scipy_optimizer.py
@@ -63,7 +63,7 @@ def test_kwargs_to_impact_func_creator(self, _):
         # Call 'run', make sure that 'minimize' is only with these parameters
         params_init = {"x_2": 1, "x 1": 2, "x_3": 3}  # NOTE: Also works with whitespace
         with patch.object(self.input, "impact_to_aligned_df") as align:
-            align.return_value = (None, None)
+            align.return_value = (pd.DataFrame(), pd.DataFrame())
             self.optimizer.run(params_init=params_init, options={"maxiter": 1})
 
         # Check call to '_kwargs_to_impact_func_creator'

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@`
`26`	`26`	`BayesianOptimizerOutputEvaluator,`
`27`	`27`	`select_best,`
`28`	`28`	`)`
	`29`	`+from .cost_func import mse, msle`
`29`	`30`	`from .ensemble import (`
`30`	`31`	`AverageEnsembleOptimizer,`
`31`	`32`	`EnsembleOptimizerOutput,`
Original file line number	Diff line number	Diff line change
`@@ -617,7 +617,7 @@ def __post_init__(self, random_state, allow_duplicate_points, bayes_opt_kwds):`
`617`	`617`	`)`
`618`	`618`
`619`	`619`	`def _target_func(`
`620`		`- self, data: pd.DataFrame, predicted: pd.DataFrame, weights: pd.DataFrame \| None`
	`620`	`+ self, data: np.ndarray, predicted: np.ndarray, weights: np.ndarray \| None`
`621`	`621`	`) -> Number:`
`622`	`622`	`"""Invert the cost function because BayesianOptimization maximizes the target"""`
`623`	`623`	`return -self.input.cost_func(data, predicted, weights)`
Original file line number	Diff line number	Diff line change
`@@ -212,6 +212,7 @@ def test_align(self, _):`
`212`	`212`	`self.input.impact_to_dataframe = lambda _: pd.DataFrame(`
`213`	`213`	`data={"col1": [2, 4], "col2": [4, 0]}, index=[0, 2]`
`214`	`214`	`)`
	`215`	`+ self.input.df_to_numpy = lambda df: df`
`215`	`216`	`self.input.cost_func = lambda x, y, w: (x, y, w)`
`216`	`217`
`217`	`218`	`# Apply`