automl · lhennig0103 · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025
diff --git a/smac/runhistory/encoder/gaussian_copula_encoder.py b/smac/runhistory/encoder/gaussian_copula_encoder.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+import numpy as np
+
+from smac import constants
+from smac.runhistory.encoder.encoder import RunHistoryEncoder
+from smac.utils.logging import get_logger
+import scipy
+
+__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
+__license__ = "3-clause BSD"
+
+
+logger = get_logger(__name__)
+
+
+class RunHistoryGaussianCopulaEncoder(RunHistoryEncoder):
+    """Run history encoder that maps costs to rank-based standard-normal scores."""
+
+    def transform_response_values(self, values: np.ndarray) -> np.ndarray:
+        """Map the response values to standard-normal scores via their ranks.
+
+        Each value is replaced by its average rank, the rank is rescaled to a
+        quantile in ``[0, 1]``, the quantile is clipped away from 0 and 1 by a
+        cutoff that depends on the number of values, and the inverse
+        standard-normal CDF is applied to the result.
+
+        The input array is never modified.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            Response values. They are flattened before ranking.
+
+        Returns
+        -------
+        np.ndarray
+            Array of shape ``(values.size, 1)`` with the transformed values.
+        """
+        # Imported locally: a bare ``import scipy`` does not guarantee that the
+        # ``scipy.stats`` submodule is loaded on every SciPy version.
+        from scipy import stats
+
+        # ``flatten`` always copies, so the clamping below cannot leak into
+        # the caller's array.
+        flat = np.asarray(values).flatten()
+
+        # Ensure minimal value is sufficiently large
+        min_cost = max(constants.MINIMAL_COST_FOR_LOG, 1e-10)
+
+        # NOTE(review): no logarithm is taken anywhere in this transform, so
+        # this clamp is not needed for numerical reasons and it turns distinct
+        # non-positive costs into ties. Kept to preserve the existing output;
+        # confirm whether it should be dropped.
+        if np.any(flat <= 0):
+            logger.warning(
+                "Got cost of smaller/equal to 0. Replace by %f since we use log cost.",
+                min_cost,
+            )
+            flat[flat < min_cost] = min_cost
+
+        # Count the flattened entries (the same entries that get ranked) and
+        # use at least 2 so that neither ``n - 1`` nor ``log(n)`` is zero.
+        n = max(flat.size, 2)
+
+        quants = (stats.rankdata(flat) - 1) / (n - 1)
+
+        # Ensure cutoff does not exceed reasonable values
+        cutoff = min(0.1, 1 / (4 * np.power(n, 0.25) * np.sqrt(np.pi * np.log(n))))
+
+        # Keep the quantiles strictly inside (0, 1) so that the inverse CDF
+        # stays finite.
+        quants = np.clip(quants, a_min=cutoff, a_max=1 - cutoff)
+
+        # Inverse Gaussian CDF, evaluated on the whole array at once
+        return stats.norm.ppf(quants).reshape((-1, 1))
+
diff --git a/smac/runhistory/encoder/percentile_encoder.py b/smac/runhistory/encoder/percentile_encoder.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+import numpy as np
+import scipy.stats
+from smac import constants
+from smac.runhistory.encoder.encoder import RunHistoryEncoder
+from smac.utils.logging import get_logger
+
+__copyright__ = "Copyright 2022, automl.org"
+__license__ = "3-clause BSD"
+
+logger = get_logger(__name__)
+
+
+class RunHistoryPercentileEncoder(RunHistoryEncoder):
+    """Run history encoder that maps log costs to percentile-based normal scores."""
+
+    def transform_response_values(self, values: np.ndarray) -> np.ndarray:
+        """Map the response values to standard-normal scores via their percentiles.
+
+        The values are clamped to ``constants.MINIMAL_COST_FOR_LOG`` if any of
+        them is non-positive, log-transformed, converted to their percentile
+        rank within the sample, and passed through the inverse standard-normal
+        CDF.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            Response values. Input with more than one dimension is averaged
+            along axis 1 first.
+
+        Returns
+        -------
+        np.ndarray
+            Column array (second dimension of size 1) with the transformed
+            values.
+        """
+        logger.debug("Encoder input values.shape = %s", values.shape)
+
+        # Safeguard: aggregate if values look like multiple per config
+        if values.ndim > 1:
+            # Averaging a single column changes nothing, so only warn when
+            # several columns are actually merged.
+            if values.shape[1] > 1:
+                logger.warning(
+                    "Received values with shape %s, aggregating along axis=1.",
+                    values.shape,
+                )
+            values = np.mean(values, axis=1)
+
+        # Nothing to rank; also avoids dividing by a sample size of zero below.
+        if values.size == 0:
+            return np.empty((0, 1), dtype=float)
+
+        # Replace non-positive values with minimal cost
+        if np.any(values <= 0):
+            logger.warning(
+                "Got cost <= 0. Replacing by %f since we use log cost.",
+                constants.MINIMAL_COST_FOR_LOG,
+            )
+            values = np.clip(values, constants.MINIMAL_COST_FOR_LOG, None)
+
+        # Apply log transformation
+        log_values = np.log(values)
+
+        # Rank-based quantiles: for a score taken from the sample itself,
+        # ``percentileofscore(a, v) / 100`` with the default ``kind="rank"``
+        # equals the average rank of ``v`` divided by the sample size. One
+        # ``rankdata`` call replaces a separate pass over the data per value.
+        eps = 1e-6  # keep strictly within (0,1)
+        quants = np.clip(
+            scipy.stats.rankdata(log_values) / log_values.size, eps, 1 - eps
+        )
+
+        # Inverse Gaussian CDF transformation
+        return scipy.stats.norm.ppf(quants).reshape((-1, 1))
diff --git a/smac/runhistory/encoder/power_transform_encoder.py b/smac/runhistory/encoder/power_transform_encoder.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+import numpy as np
+from sklearn.preprocessing import PowerTransformer
+
+from smac.runhistory.encoder.encoder import RunHistoryEncoder
+from smac.utils.logging import get_logger
+
+__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
+__license__ = "3-clause BSD"
+
+logger = get_logger(__name__)
+
+
+class RunHistoryPowerTransformEncoder(RunHistoryEncoder):
+    def transform_response_values(self, values: np.ndarray) -> np.ndarray:
+        """Run the response values through a Yeo-Johnson power transform.
+
+        The values are arranged as a single column. A non-empty column is used
+        to fit a fresh ``PowerTransformer`` (``standardize=True``) whose
+        transformed output is returned; an empty column is returned as is
+        after a debug message.
+        """
+        column = values.reshape(-1, 1)
+
+        if column.size > 0:
+            yeo_johnson = PowerTransformer(method="yeo-johnson", standardize=True)
+            return yeo_johnson.fit_transform(column)
+
+        # Fitting needs at least one sample, so empty input skips the transformer.
+        logger.debug("Received empty array for transformation.")
+        return column