make eta configurable (#1526)

jduerholt · facebook-github-bot · commit 8ecc903392b2 · 2022-11-30T16:54:14.000-08:00
Summary:  ## Motivation I recently looked into the output constraint implementation in botorch and found that it behaves like our custom `Objective` implementation for handling constraints, namely by multiplying by sigmoids. Currently, the only difference is that we often work with different `eta` values per constraint. I think this would also be a nice feature for `botorch`. This PR is still a work in progress, as the `apply_constraints` method is used in many places throughout the codebase, and my question is whether one wants to keep backwards compatibility. In the current PR, I kept backwards compatibility and made `eta` of type `Union[float, torch.Tensor]`. If one does this, one always has to check whether just a float is provided and transform the float into a tensor of the same length as the list of constraint callables. Another option would be to make `eta` optional with default `None` and then just generate a tensor with the old default of 1e-3. Which solution would you prefer? Depending on your suggestion, I would finalize the PR and implement the functionality of different `eta`s per constraint throughout the whole codebase. ### Have you read the [Contributing Guidelines on pull requests]? Yes. Pull Request resolved: #1526 Test Plan: Unit tests. Reviewed By: Balandat Differential Revision: D41600602 Pulled By: saitcakmak fbshipit-source-id: c04e68f6f0bb2264938f2dfb360a378cc661654a
diff --git a/botorch/acquisition/multi_objective/monte_carlo.py b/botorch/acquisition/multi_objective/monte_carlo.py
@@ -85,6 +85,7 @@ def __init__(
         sampler: Optional[MCSampler] = None,
         objective: Optional[MCMultiOutputObjective] = None,
         constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
+        eta: Optional[Union[Tensor, float]] = 1e-3,
         X_pending: Optional[Tensor] = None,
     ) -> None:
         r"""Constructor for the MCAcquisitionFunction base class.
@@ -102,6 +103,12 @@ def __init__(
                 `sample_shape x batch-shape x q x m` to a Tensor of dimension
                 `sample_shape x batch-shape x q`, where negative values imply
                 feasibility.
+            eta: The temperature parameter for the sigmoid function used for the
+                differentiable approximation of the constraints. In case of a float the
+                same eta is used for every constraint in constraints. In case of a
+                tensor the length of the tensor must match the number of provided
+                constraints. The i-th constraint is then estimated with the i-th
+                eta value.
             X_pending:  A `m x d`-dim Tensor of `m` design points that have
                 points that have been submitted for function evaluation
                 but have not yet been evaluated.
@@ -128,6 +135,10 @@ def __init__(
             )
         self.add_module("objective", objective)
         self.constraints = constraints
+        if constraints:
+            if type(eta) != Tensor:
+                eta = torch.full((len(constraints),), eta)
+            self.register_buffer("eta", eta)
         self.X_pending = None
         if X_pending is not None:
             self.set_X_pending(X_pending)
@@ -153,7 +164,7 @@ def __init__(
         objective: Optional[MCMultiOutputObjective] = None,
         constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
         X_pending: Optional[Tensor] = None,
-        eta: float = 1e-3,
+        eta: Optional[Union[Tensor, float]] = 1e-3,
     ) -> None:
         r"""q-Expected Hypervolume Improvement supporting m>=2 outcomes.
 
@@ -189,7 +200,11 @@ def __init__(
                 been evaluated. Concatenated into `X` upon forward call. Copied and set
                 to have no gradient.
             eta: The temperature parameter for the sigmoid function used for the
-                differentiable approximation of the constraints.
+                differentiable approximation of the constraints. In case of a float the
+                same eta is used for every constraint in constraints. In case of a
+                tensor the length of the tensor must match the number of provided
+                constraints. The i-th constraint is then estimated with the i-th
+                eta value.
         """
         if len(ref_point) != partitioning.num_outcomes:
             raise ValueError(
@@ -207,9 +222,9 @@ def __init__(
             sampler=sampler,
             objective=objective,
             constraints=constraints,
+            eta=eta,
             X_pending=X_pending,
         )
-        self.eta = eta
         self.register_buffer("ref_point", ref_point)
         cell_bounds = partitioning.get_hypercell_bounds()
         self.register_buffer("cell_lower_bounds", cell_bounds[0])
@@ -357,7 +372,7 @@ def __init__(
         objective: Optional[MCMultiOutputObjective] = None,
         constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
         X_pending: Optional[Tensor] = None,
-        eta: float = 1e-3,
+        eta: Optional[Union[Tensor, float]] = 1e-3,
         prune_baseline: bool = False,
         alpha: float = 0.0,
         cache_pending: bool = True,
@@ -400,7 +415,11 @@ def __init__(
                 have points that have been submitted for function evaluation, but
                 have not yet been evaluated.
             eta: The temperature parameter for the sigmoid function used for the
-                differentiable approximation of the constraints.
+                differentiable approximation of the constraints. In case of a float the
+                same eta is used for every constraint in constraints. In case of a
+                tensor the length of the tensor must match the number of provided
+                constraints. The i-th constraint is then estimated with the i-th
+                eta value.
             prune_baseline: If True, remove points in `X_baseline` that are
                 highly unlikely to be the pareto optimal and better than the
                 reference point. This can significantly improve computation time and
@@ -431,6 +450,7 @@ def __init__(
             sampler=sampler,
             objective=objective,
             constraints=constraints,
+            eta=eta,
         )
         self._setup(model=model, cache_root=cache_root)
 
@@ -450,7 +470,6 @@ def __init__(
             )
         self.register_buffer("ref_point", ref_point)
         self.alpha = alpha
-        self.eta = eta
         self.q_in = -1
         self.q_out = -1
         self.q_subset_indices = BufferDict()
diff --git a/botorch/acquisition/multi_objective/multi_fidelity.py b/botorch/acquisition/multi_objective/multi_fidelity.py
@@ -46,9 +46,9 @@ def __init__(
         sampler: Optional[MCSampler] = None,
         objective: Optional[MCMultiOutputObjective] = None,
         constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
+        eta: Optional[Union[Tensor, float]] = 1e-3,
         X_pending: Optional[Tensor] = None,
         cost_call: Callable[Tensor, Tensor] = None,
-        eta: float = 1e-3,
         **kwargs: Any,
     ) -> None:
         r"""MOMF acquisition function supporting m>=2 outcomes.
@@ -98,7 +98,11 @@ def __init__(
                 `batch_shape x q x m`. Defaults to an AffineCostModel with
                 `C(s) = 1 + s`.
             eta: The temperature parameter for the sigmoid function used for the
-                differentiable approximation of the constraints.
+                differentiable approximation of the constraints. In case of a float the
+                same eta is used for every constraint in constraints. In case of a
+                tensor the length of the tensor must match the number of provided
+                constraints. The i-th constraint is then estimated with the i-th
+                eta value.
         """
 
         if len(ref_point) != partitioning.num_outcomes:
@@ -119,6 +123,7 @@ def __init__(
             sampler=sampler,
             objective=objective,
             constraints=constraints,
+            eta=eta,
             X_pending=X_pending,
         )
 
diff --git a/botorch/acquisition/objective.py b/botorch/acquisition/objective.py
@@ -453,7 +453,7 @@ def __init__(
         objective: Callable[[Tensor, Optional[Tensor]], Tensor],
         constraints: List[Callable[[Tensor], Tensor]],
         infeasible_cost: Union[Tensor, float] = 0.0,
-        eta: float = 1e-3,
+        eta: Union[Tensor, float] = 1e-3,
     ) -> None:
         r"""
         Args:
@@ -468,11 +468,17 @@ def __init__(
             infeasible_cost: The cost of a design if all associated samples are
                 infeasible.
             eta: The temperature parameter of the sigmoid function approximating
-                the constraint.
+                the constraint. Can be either a float or a 1-dim tensor. In case
+                of a float the same eta is used for every constraint in
+                constraints. In case of a tensor the length of the tensor must
+                match the number of provided constraints. The i-th constraint is
+                then estimated with the i-th eta value.
         """
         super().__init__(objective=objective)
         self.constraints = constraints
-        self.register_buffer("eta", torch.as_tensor(eta))
+        if type(eta) != Tensor:
+            eta = torch.full((len(constraints),), eta)
+        self.register_buffer("eta", eta)
         self.register_buffer("infeasible_cost", torch.as_tensor(infeasible_cost))
 
     def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor:
diff --git a/botorch/utils/objective.py b/botorch/utils/objective.py
@@ -10,7 +10,7 @@
 
 from __future__ import annotations
 
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union
 
 import torch
 from torch import Tensor
@@ -64,7 +64,7 @@ def apply_constraints_nonnegative_soft(
     obj: Tensor,
     constraints: List[Callable[[Tensor], Tensor]],
     samples: Tensor,
-    eta: float,
+    eta: Union[Tensor, float],
 ) -> Tensor:
     r"""Applies constraints to a non-negative objective.
 
@@ -78,14 +78,24 @@ def apply_constraints_nonnegative_soft(
             This callable must support broadcasting. Only relevant for multi-
             output models (`m` > 1).
         samples: A `n_samples x b x q x m` Tensor of samples drawn from the posterior.
-        eta: The temperature parameter for the sigmoid function.
+        eta: The temperature parameter for the sigmoid function. Can be either a float
+            or a 1-dim tensor. In case of a float the same eta is used for every
+            constraint in constraints. In case of a tensor the length of the tensor
+            must match the number of provided constraints. The i-th constraint is
+            then estimated with the i-th eta value.
 
     Returns:
         A `n_samples x b x q (x m')`-dim tensor of feasibility-weighted objectives.
     """
+    if type(eta) != Tensor:
+        eta = torch.full((len(constraints),), eta)
+    if len(eta) != len(constraints):
+        raise ValueError(
+            "Number of provided constraints and number of provided etas do not match."
+        )
     obj = obj.clamp_min(0)  # Enforce non-negativity with constraints
-    for constraint in constraints:
-        constraint_eval = soft_eval_constraint(constraint(samples), eta=eta)
+    for constraint, e in zip(constraints, eta):
+        constraint_eval = soft_eval_constraint(constraint(samples), eta=e)
         if obj.dim() == samples.dim():
             # Need to unsqueeze to accommodate the outcome dimension.
             constraint_eval = constraint_eval.unsqueeze(-1)
@@ -101,7 +111,7 @@ def soft_eval_constraint(lhs: Tensor, eta: float = 1e-3) -> Tensor:
     Args:
         lhs: The left hand side of the constraint `lhs <= 0`.
         eta: The temperature parameter of the softmax function. As eta
-            grows larger, this approximates the Heaviside step function.
+            decreases, this approximates the Heaviside step function.
 
     Returns:
         Element-wise 'soft' feasibility indicator of the same shape as `lhs`.
@@ -118,7 +128,7 @@ def apply_constraints(
     constraints: List[Callable[[Tensor], Tensor]],
     samples: Tensor,
     infeasible_cost: float,
-    eta: float = 1e-3,
+    eta: Union[Tensor, float] = 1e-3,
 ) -> Tensor:
     r"""Apply constraints using an infeasible_cost `M` for negative objectives.
 
@@ -136,7 +146,11 @@ def apply_constraints(
             output models (`m` > 1).
         samples: A `n_samples x b x q x m` Tensor of samples drawn from the posterior.
         infeasible_cost: The infeasible value.
-        eta: The temperature parameter of the sigmoid function.
+        eta: The temperature parameter of the sigmoid function. Can be either a float
+            or a 1-dim tensor. In case of a float the same eta is used for every
+            constraint in constraints. In case of a tensor the length of the tensor
+            must match the number of provided constraints. The i-th constraint is
+            then estimated with the i-th eta value.
 
     Returns:
         A `n_samples x b x q (x m')`-dim tensor of feasibility-weighted objectives.
diff --git a/test/acquisition/multi_objective/test_monte_carlo.py b/test/acquisition/multi_objective/test_monte_carlo.py
@@ -517,16 +517,59 @@ def test_constrained_q_expected_hypervolume_improvement(self):
             X = torch.zeros(1, 1, **tkwargs)
             # test zero slack
             for eta in (1e-1, 1e-2):
+                expected_values = [0.5 * 1.5, 0.5 * 0.5 * 1.5]
+                for i, constraints in enumerate(
+                    [
+                        [lambda Z: torch.zeros_like(Z[..., -1])],
+                        [
+                            lambda Z: torch.zeros_like(Z[..., -1]),
+                            lambda Z: torch.zeros_like(Z[..., -1]),
+                        ],
+                    ]
+                ):
+                    acqf = qExpectedHypervolumeImprovement(
+                        model=mm,
+                        ref_point=ref_point,
+                        partitioning=partitioning,
+                        sampler=sampler,
+                        constraints=constraints,
+                        eta=eta,
+                    )
+                    res = acqf(X)
+                    self.assertAlmostEqual(res.item(), expected_values[i], places=4)
+            # test multiple constraints with a single eta and with per-constraint etas
+            constraints = [
+                lambda Z: torch.ones_like(Z[..., -1]),
+                lambda Z: torch.ones_like(Z[..., -1]),
+            ]
+            etas = [1, torch.tensor([1, 10])]
+            expected_values = [
+                (
+                    torch.sigmoid(torch.as_tensor(-1.0))
+                    * torch.sigmoid(torch.as_tensor(-1.0))
+                    * 1.5
+                ).item(),
+                (
+                    torch.sigmoid(torch.as_tensor(-1.0))
+                    * torch.sigmoid(torch.as_tensor(-1.0 / 10.0))
+                    * 1.5
+                ).item(),
+            ]
+            for eta, expected_value in zip(etas, expected_values):
                 acqf = qExpectedHypervolumeImprovement(
                     model=mm,
                     ref_point=ref_point,
                     partitioning=partitioning,
                     sampler=sampler,
-                    constraints=[lambda Z: torch.zeros_like(Z[..., -1])],
+                    constraints=constraints,
                     eta=eta,
                 )
                 res = acqf(X)
-                self.assertAlmostEqual(res.item(), 0.5 * 1.5, places=4)
+                self.assertAlmostEqual(
+                    res.item(),
+                    expected_value,
+                    places=4,
+                )
             # test feasible
             acqf = qExpectedHypervolumeImprovement(
                 model=mm,
@@ -1074,7 +1117,29 @@ def test_constrained_q_noisy_expected_hypervolume_improvement(self):
             )
             mm = MockModel(MockPosterior(samples=baseline_samples))
             X = torch.zeros(1, 1, **tkwargs)
-            # test zero slack
+            # test zero slack multiple constraints, multiple etas
+            for eta in [1e-1, 1e-2, torch.tensor([1.0, 10.0])]:
+                # set the MockPosterior to use samples over baseline points
+                mm._posterior._samples = baseline_samples
+                sampler = IIDNormalSampler(sample_shape=torch.Size([1]))
+                acqf = qNoisyExpectedHypervolumeImprovement(
+                    model=mm,
+                    ref_point=ref_point,
+                    X_baseline=X_baseline,
+                    sampler=sampler,
+                    constraints=[
+                        lambda Z: torch.zeros_like(Z[..., -1]),
+                        lambda Z: torch.zeros_like(Z[..., -1]),
+                    ],
+                    eta=eta,
+                    cache_root=False,
+                )
+                # set the MockPosterior to use samples over baseline points and new
+                # candidates
+                mm._posterior._samples = samples
+                res = acqf(X)
+                self.assertAlmostEqual(res.item(), 0.5 * 0.5 * 1.5, places=4)
+            # test zero slack single constraint
             for eta in (1e-1, 1e-2):
                 # set the MockPosterior to use samples over baseline points
                 mm._posterior._samples = baseline_samples
@@ -1169,6 +1234,37 @@ def test_constrained_q_noisy_expected_hypervolume_improvement(self):
             mm._posterior._samples = samples
             res = acqf(X)
             self.assertAlmostEqual(res.item(), 1.5, places=4)
+            # test single and multiple constraints with tensor-valued etas
+            # this crashes for large etas, and I do not know why
+            # set the MockPosterior to use samples over baseline points
+            etas = [torch.tensor([1.0]), torch.tensor([1.0, 10.0])]
+            constraints = [
+                [lambda Z: torch.ones_like(Z[..., -1])],
+                [
+                    lambda Z: torch.ones_like(Z[..., -1]),
+                    lambda Z: torch.ones_like(Z[..., -1]),
+                ],
+            ]
+            expected_values = [
+                (torch.sigmoid(torch.as_tensor(-1.0 / 1)) * 1.5).item(),
+                (
+                    torch.sigmoid(torch.as_tensor(-1.0 / 1))
+                    * torch.sigmoid(torch.as_tensor(-1.0 / 10))
+                    * 1.5
+                ).item(),
+            ]
+            for eta, constraint, expected_value in zip(
+                etas, constraints, expected_values
+            ):
+                acqf.constraints = constraint
+                acqf.eta = eta
+                res = acqf(X)
+
+                self.assertAlmostEqual(
+                    res.item(),
+                    expected_value,
+                    places=4,
+                )
             # test infeasible
             # set the MockPosterior to use samples over baseline points
             mm._posterior._samples = baseline_samples
diff --git a/test/acquisition/test_objective.py b/test/acquisition/test_objective.py
diff --git a/test/utils/test_objective.py b/test/utils/test_objective.py