SimonBlanke
diff --git a/src/surfaces/test_functions/machine_learning/_base_machine_learning.py b/src/surfaces/test_functions/machine_learning/_base_machine_learning.py
Lines changed: 24 additions & 2 deletions
diff --git a/src/surfaces/test_functions/machine_learning/tabular/classification/test_functions/k_neighbors_classifier.py b/src/surfaces/test_functions/machine_learning/tabular/classification/test_functions/k_neighbors_classifier.py
Lines changed: 102 additions & 36 deletions
diff --git a/src/surfaces/test_functions/machine_learning/tabular/regression/datasets.py b/src/surfaces/test_functions/machine_learning/tabular/regression/datasets.py
Lines changed: 21 additions & 1 deletion
@@ -56,7 +56,9 @@ def __init__(
         use_surrogate: bool = False,
         **kwargs,
     ):
-        super().__init__(objective, sleep, memory, collect_data, callbacks, catch_errors)
+        super().__init__(
+            objective, sleep, memory, collect_data, callbacks, catch_errors
+        )
         self.use_surrogate = use_surrogate
         self._surrogate = None
 
@@ -77,6 +79,24 @@ def _load_surrogate(self) -> None:
             )
             self.use_surrogate = False
 
+    def _get_surrogate_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """Get parameters for surrogate prediction.
+
+        Override in subclasses to add fixed parameters (like dataset, cv)
+        that are not in the search space but needed by the surrogate.
+
+        Parameters
+        ----------
+        params : dict
+            Search parameters from the optimizer.
+
+        Returns
+        -------
+        dict
+            Full parameters for surrogate prediction.
+        """
+        return params
+
     def _evaluate(self, params: Dict[str, Any]) -> float:
         """Evaluate with timing and objective transformation.
 
@@ -86,7 +106,9 @@ def _evaluate(self, params: Dict[str, Any]) -> float:
         time.sleep(self.sleep)
 
         if self.use_surrogate and self._surrogate is not None:
-            raw_value = self._surrogate.predict(params)
+            # Use _get_surrogate_params to include fixed params (dataset, cv)
+            surrogate_params = self._get_surrogate_params(params)
+            raw_value = self._surrogate.predict(surrogate_params)
         else:
             raw_value = self.pure_objective_function(params)
 
 
@@ -2,13 +2,22 @@
 # Email: [email protected]
 # License: MIT License
 
+"""K-Nearest Neighbors Classifier test function with surrogate support."""
+
 import numpy as np
 from sklearn.model_selection import cross_val_score
 from sklearn.neighbors import KNeighborsClassifier
 
 from .._base_classification import BaseClassification
 from ..datasets import digits_data, iris_data, wine_data
 
+# Dataset registry: maps string names to loader functions
+DATASETS = {
+    "digits": digits_data,
+    "iris": iris_data,
+    "wine": wine_data,
+}
+
 
 class KNeighborsClassifierFunction(BaseClassification):
     """K-Nearest Neighbors Classifier test function.
@@ -18,73 +27,130 @@ class KNeighborsClassifierFunction(BaseClassification):
 
     Parameters
     ----------
-    metric : str, default="accuracy"
-        Scoring metric for cross-validation.
+    dataset : str, default="digits"
+        Dataset to use for evaluation. One of: "digits", "iris", "wine".
+        This is a fixed parameter (like a coefficient), not part of the search space.
+    cv : int, default=5
+        Number of cross-validation folds.
+        This is a fixed parameter, not part of the search space.
+    use_surrogate : bool, default=False
+        If True, use pre-trained surrogate model for fast evaluation (~1ms).
+        Falls back to real evaluation if no surrogate is available.
+    objective : str, default="maximize"
+        Either "minimize" or "maximize".
     sleep : float, default=0
         Artificial delay in seconds added to each evaluation.
 
     Attributes
     ----------
-    para_names : list
-        Names of the hyperparameters: n_neighbors, algorithm, cv, dataset.
-    n_neighbors_default : list
-        Default values for n_neighbors parameter (3 to 150, step 5).
-    algorithm_default : list
-        Default algorithm options: auto, ball_tree, kd_tree, brute.
-    cv_default : list
-        Default cross-validation fold options: 2, 3, 4, 5, 8, 10.
-    dataset_default : list
-        Default datasets (digits, wine, iris).
+    available_datasets : list
+        Available dataset names: ["digits", "iris", "wine"].
+    available_cv : list
+        Available CV fold options: [2, 3, 5, 10].
 
     Examples
     --------
+    Basic usage with real evaluation:
+
     >>> from surfaces.test_functions import KNeighborsClassifierFunction
-    >>> func = KNeighborsClassifierFunction()
-    >>> search_space = func.search_space
-    >>> list(search_space.keys())
-    ['n_neighbors', 'algorithm', 'cv', 'dataset']
+    >>> func = KNeighborsClassifierFunction(dataset="iris", cv=5)
+    >>> func.search_space
+    {'n_neighbors': [3, 8, 13, ...], 'algorithm': ['auto', 'ball_tree', ...]}
+    >>> result = func({"n_neighbors": 5, "algorithm": "auto"})
+
+    Fast evaluation with surrogate (requires surfaces[surrogates]):
+
+    >>> func = KNeighborsClassifierFunction(dataset="iris", cv=5, use_surrogate=True)
+    >>> result = func({"n_neighbors": 5, "algorithm": "auto"})  # ~1ms
     """
 
     name = "KNeighbors Classifier Function"
     _name_ = "k_neighbors_classifier"
     __name__ = "KNeighborsClassifierFunction"
 
-    para_names = ["n_neighbors", "algorithm", "cv", "dataset"]
+    # Available options (for validation and documentation)
+    available_datasets = list(DATASETS.keys())
+    available_cv = [2, 3, 5, 10]
 
+    # Search space parameters (only actual hyperparameters)
+    para_names = ["n_neighbors", "algorithm"]
     n_neighbors_default = list(np.arange(3, 150, 5))
     algorithm_default = ["auto", "ball_tree", "kd_tree", "brute"]
-    cv_default = [2, 3, 4, 5, 8, 10]
-    dataset_default = [digits_data, wine_data, iris_data]
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
 
-    def _search_space(
+    def __init__(
         self,
-        n_neighbors: list = None,
-        algorithm: list = None,
-        cv: list = None,
-        dataset: list = None,
+        dataset: str = "digits",
+        cv: int = 5,
+        objective: str = "maximize",
+        sleep: float = 0,
+        memory: bool = False,
+        collect_data: bool = True,
+        callbacks=None,
+        catch_errors=None,
+        use_surrogate: bool = False,
     ):
-        search_space: dict = {}
+        # Validate dataset
+        if dataset not in DATASETS:
+            raise ValueError(
+                f"Unknown dataset '{dataset}'. "
+                f"Available: {self.available_datasets}"
+            )
+
+        # Validate cv
+        if cv not in self.available_cv:
+            raise ValueError(
+                f"Invalid cv={cv}. Available: {self.available_cv}"
+            )
 
-        search_space["n_neighbors"] = (
-            self.n_neighbors_default if n_neighbors is None else n_neighbors
+        # Store fixed parameters (like coefficients in math functions)
+        self.dataset = dataset
+        self.cv = cv
+
+        # Load dataset for real evaluation
+        self._dataset_loader = DATASETS[dataset]
+
+        super().__init__(
+            objective=objective,
+            sleep=sleep,
+            memory=memory,
+            collect_data=collect_data,
+            callbacks=callbacks,
+            catch_errors=catch_errors,
+            use_surrogate=use_surrogate,
         )
-        search_space["algorithm"] = self.algorithm_default if algorithm is None else algorithm
-        search_space["cv"] = self.cv_default if cv is None else cv
-        search_space["dataset"] = self.dataset_default if dataset is None else dataset
 
-        return search_space
+    @property
+    def search_space(self):
+        """Search space containing only hyperparameters (not dataset/cv)."""
+        return {
+            "n_neighbors": self.n_neighbors_default,
+            "algorithm": self.algorithm_default,
+        }
 
     def _create_objective_function(self):
+        """Create objective function with fixed dataset and cv."""
+        # Load dataset once
+        X, y = self._dataset_loader()
+        cv = self.cv
+
         def k_neighbors_classifier(params):
             knc = KNeighborsClassifier(
                 n_neighbors=params["n_neighbors"],
                 algorithm=params["algorithm"],
             )
-            X, y = params["dataset"]()
-            scores = cross_val_score(knc, X, y, cv=params["cv"], scoring="accuracy")
+            scores = cross_val_score(knc, X, y, cv=cv, scoring="accuracy")
             return scores.mean()
 
         self.pure_objective_function = k_neighbors_classifier
+
+    def _get_surrogate_params(self, params):
+        """Add fixed parameters (dataset, cv) to params for surrogate prediction.
+
+        The surrogate model was trained on all (HP, dataset, cv) combinations,
+        so we need to include the fixed parameters when querying it.
+        """
+        return {
+            **params,
+            "dataset": self.dataset,
+            "cv": self.cv,
+        }
@@ -2,10 +2,30 @@
 # Email: [email protected]
 # License: MIT License
 
-from sklearn.datasets import load_diabetes
+"""Regression datasets for ML test functions."""
 
+from sklearn.datasets import load_diabetes, fetch_california_housing
+
+# Pre-load datasets for fast access
 diabetes_dataset = load_diabetes()
+_california_dataset = None  # Lazy load (larger download)
 
 
 def diabetes_data():
+    """Load diabetes dataset (442 samples, 10 features)."""
     return diabetes_dataset.data, diabetes_dataset.target
+
+
+def california_data():
+    """Load California housing dataset (20640 samples, 8 features)."""
+    global _california_dataset
+    if _california_dataset is None:
+        _california_dataset = fetch_california_housing()
+    return _california_dataset.data, _california_dataset.target
+
+
+# Registry for easy access
+DATASETS = {
+    "diabetes": diabetes_data,
+    "california": california_data,
+}