Commit 9da6f22

sdaulton authored and facebook-github-bot committed
add utilities for straight-through gradient estimators for discretization functions (#1515)
Summary:
Pull Request resolved: #1515

see title

Reviewed By: Balandat

Differential Revision: D41475380

fbshipit-source-id: d5ba14b4f4e9c9fe51be73eec45ed03f625711f1
1 parent 92f0d1d commit 9da6f22
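
For context: rounding and argmax are piecewise constant, so their true gradients are zero almost everywhere, which stalls gradient-based acquisition optimization. A straight-through estimator keeps the exact discretization in the forward pass but passes incoming gradients through unchanged in the backward pass. A minimal sketch of the idea in plain PyTorch (the detach-based formulation below is only an illustration of the trick, not the implementation this commit adds):

import torch

X = torch.tensor([0.3, 1.6, 2.5], requires_grad=True)
# Forward value equals X.round(); the (X.round() - X).detach() term is
# a constant to autograd, so the expression has derivative 1 w.r.t. X.
rounded = X + (X.round() - X).detach()
rounded.sum().backward()
print(rounded)  # tensor([0., 2., 2.], grad_fn=...)
print(X.grad)   # tensor([1., 1., 1.])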

File tree: 3 files changed, +146 -2 lines changed


botorch/test_functions/multi_objective.py

Lines changed: 2 additions & 1 deletion

@@ -11,7 +11,8 @@

 .. [Daulton2022]
     S. Daulton, S. Cakmak, M. Balandat, M. A. Osborne, E. Zhou, and E. Bakshy.
-    Robust Multi-Objective Bayesian Optimization Under Input Noise. 2022.
+    Robust Multi-Objective Bayesian Optimization Under Input Noise.
+    Proceedings of the 39th International Conference on Machine Learning, 2022.

 .. [Deb2005dtlz]
     K. Deb, L. Thiele, M. Laumanns, E. Zitzler, A. Abraham, L. Jain, and

botorch/utils/rounding.py

Lines changed: 79 additions & 0 deletions

@@ -4,10 +4,24 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

+r"""
+Discretization (rounding) functions for acquisition optimization.
+
+References
+
+.. [Daulton2022bopr]
+    S. Daulton, X. Wan, D. Eriksson, M. Balandat, M. A. Osborne, E. Bakshy.
+    Bayesian Optimization over Discrete and Mixed Spaces via Probabilistic
+    Reparameterization. Advances in Neural Information Processing Systems
+    35, 2022.
+"""
+
 from __future__ import annotations

 import torch
 from torch import Tensor
+from torch.autograd import Function
+from torch.nn.functional import one_hot


 def approximate_round(X: Tensor, tau: float = 1e-3) -> Tensor:
@@ -27,3 +41,68 @@ def approximate_round(X: Tensor, tau: float = 1e-3) -> Tensor:
     scaled_remainder = (X - offset - 0.5) / tau
     rounding_component = (torch.tanh(scaled_remainder) + 1) / 2
     return offset + rounding_component
+
+
+class IdentitySTEFunction(Function):
+    """Base class for functions using straight-through gradient estimators.
+
+    This class approximates the gradient with the identity function.
+    """
+
+    @staticmethod
+    def backward(ctx, grad_output: Tensor) -> Tensor:
+        r"""Use a straight-through estimator for the gradient.
+
+        This uses the identity function.
+
+        Args:
+            grad_output: A tensor of gradients.
+
+        Returns:
+            The provided tensor.
+        """
+        return grad_output
+
+
+class RoundSTE(IdentitySTEFunction):
+    r"""Round the input tensor and use a straight-through gradient estimator.
+
+    [Daulton2022bopr]_ proposes using this in acquisition optimization.
+    """
+
+    @staticmethod
+    def forward(ctx, X: Tensor) -> Tensor:
+        r"""Round the input tensor element-wise.
+
+        Args:
+            X: The tensor to be rounded.
+
+        Returns:
+            A tensor where each element is rounded to the nearest integer.
+        """
+        return X.round()
+
+
+class OneHotArgmaxSTE(IdentitySTEFunction):
+    r"""Discretize a continuous relaxation of a one-hot encoded categorical.
+
+    This returns a one-hot encoded categorical and uses a straight-through
+    gradient estimator via an identity function.
+
+    [Daulton2022bopr]_ proposes using this in acquisition optimization.
+    """
+
+    @staticmethod
+    def forward(ctx, X: Tensor) -> Tensor:
+        r"""Discretize the input tensor.
+
+        This applies an argmax along the last dimension of the input tensor
+        and one-hot encodes the result.
+
+        Args:
+            X: The tensor to be discretized.
+
+        Returns:
+            A tensor containing the one-hot encoding of the argmax along the
+            last dimension of the input tensor.
+        """
+        return one_hot(X.argmax(dim=-1), num_classes=X.shape[-1]).to(X)
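
A short usage sketch of the new utilities (illustrative only; the tensor values and shapes are arbitrary, and the commented outputs follow from the forward/backward definitions above). Both classes are applied through the standard torch.autograd.Function.apply interface:

import torch
from botorch.utils.rounding import OneHotArgmaxSTE, RoundSTE

# Integer parameters: exact rounding in the forward pass,
# identity (straight-through) gradient in the backward pass.
X = torch.tensor([0.2, 1.7, 2.4], requires_grad=True)
X_int = RoundSTE.apply(X)  # tensor([0., 2., 2.])
X_int.sum().backward()     # X.grad == tensor([1., 1., 1.])

# Categorical parameters: a continuous relaxation over 4 categories
# is snapped to its one-hot argmax; gradients again pass straight through.
Z = torch.rand(5, 4, requires_grad=True)
Z_onehot = OneHotArgmaxSTE.apply(Z)  # rows of Z_onehot are one-hot
Z_onehot.sum().backward()            # Z.grad == torch.ones(5, 4)

Because the backward pass is the identity, gradients with respect to the relaxed inputs are exactly the gradients with respect to the discretized outputs, which is what makes these functions usable inside gradient-based acquisition optimization.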

test/utils/test_rounding.py

Lines changed: 65 additions & 1 deletion

@@ -6,8 +6,20 @@


 import torch
-from botorch.utils.rounding import approximate_round
+from botorch.utils.rounding import (
+    approximate_round,
+    IdentitySTEFunction,
+    OneHotArgmaxSTE,
+    RoundSTE,
+)
 from botorch.utils.testing import BotorchTestCase
+from torch.nn.functional import one_hot
+
+
+class DummySTEFunction(IdentitySTEFunction):
+    @staticmethod
+    def forward(ctx, X):
+        return 2 * X


 class TestApproximateRound(BotorchTestCase):
@@ -25,3 +37,55 @@ def test_approximate_round(self):
         X.requires_grad_(True)
         approximate_round(X).sum().backward()
         self.assertTrue((X.grad.abs() != 0).any())
+
+
+class TestIdentitySTEFunction(BotorchTestCase):
+    def test_identity_ste(self):
+        for dtype in (torch.float, torch.double):
+            X = torch.rand(3, device=self.device, dtype=dtype)
+            with self.assertRaises(NotImplementedError):
+                IdentitySTEFunction.apply(X)
+            X = X.requires_grad_(True)
+            X_out = DummySTEFunction.apply(X)
+            X_out.sum().backward()
+            self.assertTrue(torch.equal(2 * X, X_out))
+            self.assertTrue(torch.equal(X.grad, torch.ones_like(X)))
+
+
+class TestRoundSTE(BotorchTestCase):
+    def test_round_ste(self):
+        for dtype in (torch.float, torch.double):
+            # sample uniformly from the interval [-2.5, 2.5]
+            X = torch.rand(5, 2, device=self.device, dtype=dtype) * 5 - 2.5
+            expected_rounded_X = X.round()
+            rounded_X = RoundSTE.apply(X)
+            # test forward
+            self.assertTrue(torch.equal(expected_rounded_X, rounded_X))
+            # test backward
+            X = X.requires_grad_(True)
+            output = RoundSTE.apply(X)
+            # sample some weights to check that gradients are passed
+            # through as intended
+            w = torch.rand_like(X)
+            (w * output).sum().backward()
+            self.assertTrue(torch.equal(w, X.grad))
+
+
+class TestOneHotArgmaxSTE(BotorchTestCase):
+    def test_one_hot_argmax_ste(self):
+        for dtype in (torch.float, torch.double):
+            X = torch.rand(5, 4, device=self.device, dtype=dtype)
+            expected_discretized_X = one_hot(
+                X.argmax(dim=-1), num_classes=X.shape[-1]
+            ).to(X)
+            discretized_X = OneHotArgmaxSTE.apply(X)
+            # test forward
+            self.assertTrue(torch.equal(expected_discretized_X, discretized_X))
+            # test backward
+            X = X.requires_grad_(True)
+            output = OneHotArgmaxSTE.apply(X)
+            # sample some weights to check that gradients are passed
+            # through as intended
+            w = torch.rand_like(X)
+            (w * output).sum().backward()
+            self.assertTrue(torch.equal(w, X.grad))
