experimental-design · TobyBoyne · Sep 13, 2023 · Sep 13, 2023 · jduerholt · Sep 13, 2023
diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py
@@ -22,6 +22,7 @@
     )
     from bofire.data_models.surrogates.surrogate import Surrogate
     from bofire.data_models.surrogates.tanimoto_gp import TanimotoGPSurrogate
+    from bofire.data_models.surrogates.enting import EntingSurrogate
     from bofire.data_models.surrogates.xgb import XGBoostSurrogate
 
     AbstractSurrogate = Union[Surrogate, BotorchSurrogate, EmpiricalSurrogate]
@@ -33,6 +34,7 @@
         MixedSingleTaskGPSurrogate,
         MLPEnsemble,
         SaasSingleTaskGPSurrogate,
+        EntingSurrogate,
         XGBoostSurrogate,
         LinearSurrogate,
         TanimotoGPSurrogate,
@@ -44,6 +46,7 @@
         MixedSingleTaskGPSurrogate,
         MLPEnsemble,
         SaasSingleTaskGPSurrogate,
+        EntingSurrogate,
         XGBoostSurrogate,
         LinearSurrogate,
         TanimotoGPSurrogate,

diff --git a/bofire/data_models/surrogates/enting.py b/bofire/data_models/surrogates/enting.py
@@ -0,0 +1,27 @@
+from typing import Literal
+
+from pydantic import Field
+from typing_extensions import Annotated
+
+from bofire.data_models.surrogates.surrogate import Surrogate
+from bofire.data_models.surrogates.trainable import TrainableSurrogate
+
+
+class EntingSurrogate(Surrogate, TrainableSurrogate):  # hyperparameter schema of the entmoot-backed surrogate
+    type: Literal["EntingSurrogate"] = "EntingSurrogate"  # fixed type tag of this data model
+    train_lib: Literal["lgbm"] = "lgbm"  # "lgbm" is the only accepted value
+    # mean model parameters; the surrogate class places these under "train_params"
+    objective: str = "regression"  # free-form string, not validated here
+    metric: str = "rmse"  # free-form string, not validated here
+    boosting: str = "gbdt"  # free-form string, not validated here
+    num_boost_round: Annotated[int, Field(ge=1)] = 100  # validated: >= 1
+    max_depth: Annotated[int, Field(ge=1)] = 3  # validated: >= 1
+    min_data_in_leaf: Annotated[int, Field(ge=1)] = 1  # validated: >= 1
+    min_data_per_group: Annotated[int, Field(ge=1)] = 1  # validated: >= 1
+
+    # uncertainty model parameters; the surrogate class places these under "unc_params"
+    beta: Annotated[float, Field(gt=0)] = 1.96  # validated: strictly positive
+    acq_sense: Literal["exploration", "penalty"] = "exploration"
+    dist_trafo: Literal["normal", "standard"] = "normal"
+    dist_metric: Literal["euclidean_squared", "l1", "l2"] = "euclidean_squared"
+    cat_metric: Literal["overlap", "of", "goodall4"] = "overlap"
diff --git a/bofire/surrogates/api.py b/bofire/surrogates/api.py
@@ -1,5 +1,6 @@
 from bofire.surrogates.botorch_surrogates import BotorchSurrogates
 from bofire.surrogates.empirical import EmpiricalSurrogate
+from bofire.surrogates.enting import EntingSurrogate
 from bofire.surrogates.mapper import map
 from bofire.surrogates.mixed_single_task_gp import MixedSingleTaskGPSurrogate
 from bofire.surrogates.mlp import MLPEnsemble

diff --git a/bofire/surrogates/enting.py b/bofire/surrogates/enting.py
@@ -0,0 +1,84 @@
+import warnings
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+
+try:
+    from entmoot.models.enting import Enting  # type: ignore
+    from entmoot.problem_config import ProblemConfig
+except ImportError:
+    warnings.warn("entmoot not installed, BoFire's `EntingSurrogate` cannot be used.")
+
+import uuid
+
+from bofire.data_models.surrogates.api import EntingSurrogate as DataModel
+from bofire.surrogates.surrogate import Surrogate
+from bofire.surrogates.trainable import TrainableSurrogate
+
+
+class EntingSurrogate(TrainableSurrogate, Surrogate):
+    def __init__(self, data_model: DataModel, **kwargs) -> None:
+        self.train_lib = data_model.train_lib
+
+        self.objective = data_model.objective
+        self.metric = data_model.metric
+        self.boosting = data_model.boosting
+        self.num_boost_round = data_model.num_boost_round
+        self.max_depth = data_model.max_depth
+        self.min_data_in_leaf = data_model.min_data_in_leaf
+        self.min_data_per_group = data_model.min_data_per_group
+
+        self.beta = data_model.beta
+        self.acq_sense = data_model.acq_sense
+        self.dist_trafo = data_model.dist_trafo
+        self.dist_metric = data_model.dist_metric
+        self.cat_metric = data_model.cat_metric
+
+        self.tmpfile_name = f"enting_{uuid.uuid4().hex}.json"
+        super().__init__(data_model=data_model, **kwargs)
+
+    def _get_params_dict(self):
+        return {
+            "tree_train_params": {
+                "train_lib": self.train_lib,
+                "train_params": {
+                    "objective": self.objective,
+                    "metric": self.metric,
+                    "boosting": self.boosting,
+                    "num_boost_round": self.num_boost_round,
+                    "max_depth": self.max_depth,
+                    "min_data_in_leaf": self.min_data_in_leaf,
+                    "min_data_per_group": self.min_data_per_group,
+                },
+                "unc_params": {
+                    "beta": self.beta,
+                    "acq_sense": self.acq_sense,
+                    "dist_trafo": self.dist_trafo,
+                    "dist_metric": self.dist_metric,
+                    "cat_metric": self.cat_metric,
+                },
+            }
+        }
+
+    def _fit(self, X: pd.DataFrame, Y: pd.DataFrame, **kwargs):
+        transformed_X = self.inputs.transform(X, self.input_preprocessing_specs)
+        self._get_params_dict()
+        self.model = Enting()
+        self.model.fit(X=transformed_X.values, y=Y.values)
+
+    def _predict(self, transformed_X: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
+        preds = self.model.predict(transformed_X.to_numpy())
+        # pred has shape [([mu1], std1), ([mu2], std2), ... ]
+        m, v = zip(*preds)
+        mean = np.array(m)
+        std = np.sqrt(np.array(v)).reshape(-1, 1)
+        # std is given combined - copy for each objective
+        std = np.tile(std, mean.shape[1])
+        return mean, std
+
+    def loads(self, data: str):
+        pass
+
+    def _dumps(self) -> str:
+        pass
diff --git a/tests/bofire/surrogates/test_enting.py b/tests/bofire/surrogates/test_enting.py
@@ -0,0 +1,69 @@
+import importlib
+
+import pytest
+
+import bofire.surrogates.api as surrogates
+from bofire.benchmarks.single import Himmelblau
+from bofire.data_models.domain.api import Inputs, Outputs
+from bofire.data_models.enum import CategoricalEncodingEnum
+from bofire.data_models.features.api import (
+    CategoricalInput,
+    ContinuousInput,
+    ContinuousOutput,
+)
+from bofire.data_models.surrogates.api import EntingSurrogate
+
+ENTMOOT_AVAILABLE = importlib.util.find_spec("entmoot") is not None  # True when the optional entmoot package can be located
+
+
+@pytest.mark.skipif(not ENTMOOT_AVAILABLE, reason="requires entmoot")
+def test_EntingSurrogate():
+    benchmark = Himmelblau()
+    samples = benchmark.domain.inputs.sample(10)
+    experiments = benchmark.f(samples, return_complete=True)
+    #
+    data_model = EntingSurrogate(
+        inputs=benchmark.domain.inputs, outputs=benchmark.domain.outputs, n_estimators=2
+    )
+    surrogate = surrogates.map(data_model)
+    assert isinstance(surrogate, surrogates.EntingSurrogate)
+    assert surrogate.input_preprocessing_specs == {}
+    assert surrogate.is_fitted is False
+    # fit it
+    surrogate.fit(experiments=experiments)
+    assert surrogate.is_fitted is True
+    # predict it
+    surrogate.predict(experiments)
+    # # dump it
+    # dump = surrogate.dumps()
+    # # load it
+    # surrogate2 = surrogates.map(data_model)
+    # surrogate2.loads(dump)
+    # preds2 = surrogate2.predict(experiments)
+    # assert_frame_equal(preds, preds2)
+    # assert_frame_equal(preds, preds2)
+
+
+def test_EntingSurrogate_categorical():
+    inputs = Inputs(
+        features=[
+            ContinuousInput(
+                key=f"x_{i+1}",
+                bounds=(-4, 4),
+            )
+            for i in range(2)
+        ]
+        + [CategoricalInput(key="x_cat", categories=["mama", "papa"])]
+    )
+    outputs = Outputs(features=[ContinuousOutput(key="y")])
+    experiments = inputs.sample(n=10)
+    experiments.eval("y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2)", inplace=True)
+    experiments.loc[experiments.x_cat == "mama", "y"] *= 5.0
+    experiments.loc[experiments.x_cat == "papa", "y"] /= 2.0
+    experiments["valid_y"] = 1
+    data_model = EntingSurrogate(inputs=inputs, outputs=outputs, n_estimators=2)
+    assert data_model.input_preprocessing_specs == {
+        "x_cat": CategoricalEncodingEnum.ONE_HOT
+    }
+    surrogate = surrogates.map(data_model)
+    surrogate.fit(experiments)