diff --git a/smac/runhistory/encoder/gaussian_copula_encoder.py b/smac/runhistory/encoder/gaussian_copula_encoder.py
new file mode 100644
index 000000000..dde723864
--- /dev/null
+++ b/smac/runhistory/encoder/gaussian_copula_encoder.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import numpy as np
+import scipy.stats
+
+from smac import constants
+from smac.runhistory.encoder.encoder import RunHistoryEncoder
+from smac.utils.logging import get_logger
+
+__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
+__license__ = "3-clause BSD"
+
+
+logger = get_logger(__name__)
+
+
+class RunHistoryGaussianCopulaEncoder(RunHistoryEncoder):
+    """Encode costs as Gaussian scores derived from their ranks."""
+
+    def transform_response_values(self, values: np.ndarray) -> np.ndarray:
+        """Map response values to winsorized Gaussian-copula scores.
+
+        Costs are ranked (ties share their average rank), turned into
+        quantiles ``(rank - 1) / (n - 1)``, clipped to
+        ``[cutoff, 1 - cutoff]`` and passed through the inverse standard
+        normal CDF. No logarithm is applied.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            Response values. They are flattened before ranking and the
+            array itself is never modified.
+
+        Returns
+        -------
+        np.ndarray
+            Column vector with one transformed value per input element.
+        """
+        # Lower bound for costs; never below 1e-10 whatever the constant is.
+        min_log_cost = max(constants.MINIMAL_COST_FOR_LOG, 1e-10)
+
+        costs = np.asarray(values).ravel()
+        if np.any(costs <= 0):
+            logger.warning(
+                "Got cost of smaller/equal to 0. Replace by %f since we use"
+                " log cost.",
+                min_log_cost,
+            )
+            # np.maximum allocates a new array, so the caller's data survives.
+            # NOTE(review): this ties every cost below the bound although
+            # ranking needs no positive inputs - confirm it is intended.
+            costs = np.maximum(costs, min_log_cost)
+
+        # Count elements, not rows, and use at least 2 so that neither
+        # (n - 1) nor log(n) below can be zero.
+        n = max(costs.size, 2)
+        quants = (scipy.stats.rankdata(costs) - 1) / (n - 1)
+
+        # Winsorize the quantiles so the inverse CDF stays finite; the
+        # cutoff shrinks with n and is capped at 0.1.
+        cutoff = min(0.1, 1 / (4 * np.power(n, 0.25) * np.sqrt(np.pi * np.log(n))))
+        quants = np.clip(quants, cutoff, 1 - cutoff)
+
+        return scipy.stats.norm.ppf(quants).reshape(-1, 1)
diff --git a/smac/runhistory/encoder/percentile_encoder.py b/smac/runhistory/encoder/percentile_encoder.py
new file mode 100644
index 000000000..71f3ca280
--- /dev/null
+++ b/smac/runhistory/encoder/percentile_encoder.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import numpy as np
+import scipy.stats
+
+from smac import constants
+from smac.runhistory.encoder.encoder import RunHistoryEncoder
+from smac.utils.logging import get_logger
+
+__copyright__ = "Copyright 2022, automl.org"
+__license__ = "3-clause BSD"
+
+logger = get_logger(__name__)
+
+
+class RunHistoryPercentileEncoder(RunHistoryEncoder):
+    """Encode costs as Gaussian scores of their percentile ranks."""
+
+    def transform_response_values(self, values: np.ndarray) -> np.ndarray:
+        """Map response values to Gaussian scores of their percentile ranks.
+
+        The values are log-transformed, replaced by their percentile rank
+        within the batch (clipped to stay strictly inside (0, 1)) and passed
+        through the inverse standard normal CDF.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            Response values. Input with more than one dimension is averaged
+            along axis 1 first.
+
+        Returns
+        -------
+        np.ndarray
+            Column vector of transformed values.
+        """
+        logger.debug("Encoder input values.shape = %s", values.shape)
+
+        if values.ndim > 1:
+            # Averaging a single column changes no value, so only warn when
+            # several values per row are really merged.
+            if values.shape[1] != 1:
+                logger.warning(
+                    "Received values with shape %s, aggregating along axis=1.",
+                    values.shape,
+                )
+            values = np.mean(values, axis=1)
+
+        # The log below needs strictly positive inputs.
+        if np.any(values <= 0):
+            logger.warning(
+                "Got cost <= 0. Replacing by %f since we use log cost.",
+                constants.MINIMAL_COST_FOR_LOG,
+            )
+            values = np.clip(values, constants.MINIMAL_COST_FOR_LOG, None)
+
+        log_values = np.log(values)
+
+        # average rank / n equals percentileofscore(..., kind="rank") / 100
+        # for every element of the sample, without one O(n) call per element.
+        eps = 1e-6  # keep strictly within (0,1)
+        ranks = scipy.stats.rankdata(log_values)
+        quants = np.clip(ranks / max(ranks.size, 1), eps, 1 - eps)
+
+        output = scipy.stats.norm.ppf(quants).reshape(-1, 1)
+        logger.debug("Encoder output shape = %s", output.shape)
+
+        return output
diff --git a/smac/runhistory/encoder/power_transform_encoder.py b/smac/runhistory/encoder/power_transform_encoder.py
new file mode 100644
index 000000000..5b3152d8b
--- /dev/null
+++ b/smac/runhistory/encoder/power_transform_encoder.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import numpy as np
+from sklearn.preprocessing import PowerTransformer
+
+from smac.runhistory.encoder.encoder import RunHistoryEncoder
+from smac.utils.logging import get_logger
+
+__copyright__ = "Copyright 2025, Leibniz University Hanover, Institute of AI"
+__license__ = "3-clause BSD"
+
+logger = get_logger(__name__)
+
+
+class RunHistoryPowerTransformEncoder(RunHistoryEncoder):
+    """Encode costs with a Yeo-Johnson power transform."""
+
+    def transform_response_values(self, values: np.ndarray) -> np.ndarray:
+        """Apply PowerTransformer (Yeo-Johnson) to response values.
+
+        A fresh transformer is fitted on every call, so the mapping depends
+        only on the values passed in.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            Response values; reshaped into a single column before fitting.
+
+        Returns
+        -------
+        np.ndarray
+            Standardized, power-transformed values as a column vector.
+        """
+        column = values.reshape(-1, 1)
+
+        # Nothing to fit on an empty sample; hand it back in column shape.
+        if column.size == 0:
+            logger.debug("Received empty array for transformation.")
+            return column
+
+        transformer = PowerTransformer(method="yeo-johnson", standardize=True)
+        return transformer.fit_transform(column)