diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py
index e9dc0f16a..43e45d0e6 100644
--- a/bofire/data_models/surrogates/api.py
+++ b/bofire/data_models/surrogates/api.py
@@ -22,6 +22,7 @@
     )
     from bofire.data_models.surrogates.surrogate import Surrogate
     from bofire.data_models.surrogates.tanimoto_gp import TanimotoGPSurrogate
+    from bofire.data_models.surrogates.enting import EntingSurrogate
     from bofire.data_models.surrogates.xgb import XGBoostSurrogate
 
     AbstractSurrogate = Union[Surrogate, BotorchSurrogate, EmpiricalSurrogate]
@@ -33,6 +34,7 @@
         MixedSingleTaskGPSurrogate,
         MLPEnsemble,
         SaasSingleTaskGPSurrogate,
+        EntingSurrogate,
         XGBoostSurrogate,
         LinearSurrogate,
         TanimotoGPSurrogate,
@@ -44,6 +46,7 @@
         MixedSingleTaskGPSurrogate,
         MLPEnsemble,
         SaasSingleTaskGPSurrogate,
+        EntingSurrogate,
         XGBoostSurrogate,
         LinearSurrogate,
         TanimotoGPSurrogate,
diff --git a/bofire/data_models/surrogates/enting.py b/bofire/data_models/surrogates/enting.py
new file mode 100644
index 000000000..b5a7b3fcf
--- /dev/null
+++ b/bofire/data_models/surrogates/enting.py
@@ -0,0 +1,29 @@
+from typing import Literal
+
+from pydantic import Field
+from typing_extensions import Annotated
+
+from bofire.data_models.surrogates.surrogate import Surrogate
+from bofire.data_models.surrogates.trainable import TrainableSurrogate
+
+
+class EntingSurrogate(Surrogate, TrainableSurrogate):
+    """Hyperparameters of the ENTMOOT (``Enting``) tree-ensemble surrogate."""
+
+    type: Literal["EntingSurrogate"] = "EntingSurrogate"
+    train_lib: Literal["lgbm"] = "lgbm"
+    # mean model parameters
+    objective: str = "regression"
+    metric: str = "rmse"
+    boosting: str = "gbdt"
+    num_boost_round: Annotated[int, Field(ge=1)] = 100
+    max_depth: Annotated[int, Field(ge=1)] = 3
+    min_data_in_leaf: Annotated[int, Field(ge=1)] = 1
+    min_data_per_group: Annotated[int, Field(ge=1)] = 1
+
+    # uncertainty model parameters
+    beta: Annotated[float, Field(gt=0)] = 1.96
+    acq_sense: Literal["exploration", "penalty"] = "exploration"
+    dist_trafo: Literal["normal", "standard"] = "normal"
+    dist_metric: Literal["euclidean_squared", "l1", "l2"] = "euclidean_squared"
+    cat_metric: Literal["overlap", "of", "goodall4"] = "overlap"
diff --git a/bofire/surrogates/api.py b/bofire/surrogates/api.py
index 5bae145f7..285d1f96e 100644
--- a/bofire/surrogates/api.py
+++ b/bofire/surrogates/api.py
@@ -1,5 +1,6 @@
 from bofire.surrogates.botorch_surrogates import BotorchSurrogates
 from bofire.surrogates.empirical import EmpiricalSurrogate
+from bofire.surrogates.enting import EntingSurrogate
 from bofire.surrogates.mapper import map
 from bofire.surrogates.mixed_single_task_gp import MixedSingleTaskGPSurrogate
 from bofire.surrogates.mlp import MLPEnsemble
diff --git a/bofire/surrogates/enting.py b/bofire/surrogates/enting.py
new file mode 100644
index 000000000..403de2617
--- /dev/null
+++ b/bofire/surrogates/enting.py
@@ -0,0 +1,103 @@
+import uuid
+import warnings
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+
+try:
+    from entmoot.models.enting import Enting  # type: ignore
+    from entmoot.problem_config import ProblemConfig
+except ImportError:
+    warnings.warn("entmoot not installed, BoFire's `EntingSurrogate` cannot be used.")
+
+from bofire.data_models.surrogates.api import EntingSurrogate as DataModel
+from bofire.surrogates.surrogate import Surrogate
+from bofire.surrogates.trainable import TrainableSurrogate
+
+
+class EntingSurrogate(TrainableSurrogate, Surrogate):
+    """Trainable surrogate backed by entmoot's ``Enting`` model.
+
+    The hyperparameters are copied from the data model and handed to entmoot
+    in the nested dictionary built by ``_get_params_dict``.
+    """
+
+    def __init__(self, data_model: DataModel, **kwargs) -> None:
+        self.train_lib = data_model.train_lib
+
+        self.objective = data_model.objective
+        self.metric = data_model.metric
+        self.boosting = data_model.boosting
+        self.num_boost_round = data_model.num_boost_round
+        self.max_depth = data_model.max_depth
+        self.min_data_in_leaf = data_model.min_data_in_leaf
+        self.min_data_per_group = data_model.min_data_per_group
+
+        self.beta = data_model.beta
+        self.acq_sense = data_model.acq_sense
+        self.dist_trafo = data_model.dist_trafo
+        self.dist_metric = data_model.dist_metric
+        self.cat_metric = data_model.cat_metric
+
+        self.tmpfile_name = f"enting_{uuid.uuid4().hex}.json"
+        super().__init__(data_model=data_model, **kwargs)
+
+    def _get_params_dict(self) -> dict:
+        """Return the hyperparameters in the nested layout read by ``Enting``.
+
+        ``tree_train_params`` and ``unc_params`` are sibling top-level keys,
+        following entmoot's documented parameter layout.
+        """
+        return {
+            "tree_train_params": {
+                "train_lib": self.train_lib,
+                "train_params": {
+                    "objective": self.objective,
+                    "metric": self.metric,
+                    "boosting": self.boosting,
+                    "num_boost_round": self.num_boost_round,
+                    "max_depth": self.max_depth,
+                    "min_data_in_leaf": self.min_data_in_leaf,
+                    "min_data_per_group": self.min_data_per_group,
+                },
+            },
+            "unc_params": {
+                "beta": self.beta,
+                "acq_sense": self.acq_sense,
+                "dist_trafo": self.dist_trafo,
+                "dist_metric": self.dist_metric,
+                "cat_metric": self.cat_metric,
+            },
+        }
+
+    def _fit(self, X: pd.DataFrame, Y: pd.DataFrame, **kwargs):
+        """Train a fresh ``Enting`` model on the transformed ``X`` and on ``Y``."""
+        transformed_X = self.inputs.transform(X, self.input_preprocessing_specs)
+        # NOTE(review): entmoot's `Enting` also expects a `ProblemConfig`
+        # describing the features and objectives as its first argument. It is
+        # imported above but not built yet.
+        # TODO: derive it from `self.inputs` / `self.outputs` and pass it here.
+        self.model = Enting(params=self._get_params_dict())
+        self.model.fit(X=transformed_X.values, y=Y.values)
+
+    def _predict(self, transformed_X: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
+        """Return the predicted means and standard deviations as 2D arrays."""
+        preds = self.model.predict(transformed_X.to_numpy())
+        # pred has shape [([mu1], std1), ([mu2], std2), ... ]
+        m, v = zip(*preds)
+        mean = np.array(m)
+        # NOTE(review): the second entry is square-rooted as if it were a
+        # variance although the comment above calls it std - confirm.
+        std = np.sqrt(np.array(v)).reshape(-1, 1)
+        # std is given combined - copy for each objective
+        std = np.tile(std, mean.shape[1])
+        return mean, std
+
+    def loads(self, data: str):
+        """Load a dumped model; not supported yet."""
+        raise NotImplementedError("`EntingSurrogate.loads` is not implemented yet.")
+
+    def _dumps(self) -> str:
+        """Dump the fitted model to a string; not supported yet."""
+        raise NotImplementedError("`EntingSurrogate._dumps` is not implemented yet.")
diff --git a/tests/bofire/surrogates/test_enting.py b/tests/bofire/surrogates/test_enting.py
new file mode 100644
index 000000000..54893bc58
--- /dev/null
+++ b/tests/bofire/surrogates/test_enting.py
@@ -0,0 +1,65 @@
+import importlib.util
+
+import pytest
+
+import bofire.surrogates.api as surrogates
+from bofire.benchmarks.single import Himmelblau
+from bofire.data_models.domain.api import Inputs, Outputs
+from bofire.data_models.enum import CategoricalEncodingEnum
+from bofire.data_models.features.api import (
+    CategoricalInput,
+    ContinuousInput,
+    ContinuousOutput,
+)
+from bofire.data_models.surrogates.api import EntingSurrogate
+
+ENTMOOT_AVAILABLE = importlib.util.find_spec("entmoot") is not None
+
+
+@pytest.mark.skipif(not ENTMOOT_AVAILABLE, reason="requires entmoot")
+def test_EntingSurrogate():
+    benchmark = Himmelblau()
+    samples = benchmark.domain.inputs.sample(10)
+    experiments = benchmark.f(samples, return_complete=True)
+    # two boosting rounds keep the test fast
+    data_model = EntingSurrogate(
+        inputs=benchmark.domain.inputs,
+        outputs=benchmark.domain.outputs,
+        num_boost_round=2,
+    )
+    surrogate = surrogates.map(data_model)
+    assert isinstance(surrogate, surrogates.EntingSurrogate)
+    assert surrogate.input_preprocessing_specs == {}
+    assert surrogate.is_fitted is False
+    # fit it
+    surrogate.fit(experiments=experiments)
+    assert surrogate.is_fitted is True
+    # predict it
+    surrogate.predict(experiments)
+    # TODO: add a dumps/loads round trip once serialization is implemented
+
+
+@pytest.mark.skipif(not ENTMOOT_AVAILABLE, reason="requires entmoot")
+def test_EntingSurrogate_categorical():
+    inputs = Inputs(
+        features=[
+            ContinuousInput(
+                key=f"x_{i+1}",
+                bounds=(-4, 4),
+            )
+            for i in range(2)
+        ]
+        + [CategoricalInput(key="x_cat", categories=["mama", "papa"])]
+    )
+    outputs = Outputs(features=[ContinuousOutput(key="y")])
+    experiments = inputs.sample(n=10)
+    experiments.eval("y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2)", inplace=True)
+    experiments.loc[experiments.x_cat == "mama", "y"] *= 5.0
+    experiments.loc[experiments.x_cat == "papa", "y"] /= 2.0
+    experiments["valid_y"] = 1
+    data_model = EntingSurrogate(inputs=inputs, outputs=outputs, num_boost_round=2)
+    assert data_model.input_preprocessing_specs == {
+        "x_cat": CategoricalEncodingEnum.ONE_HOT
+    }
+    surrogate = surrogates.map(data_model)
+    surrogate.fit(experiments)