add support for categoricals in Round input transform and use STEs (#1516)

sdaulton · facebook-github-bot · commit 97626a91b702 · 2022-11-23T12:27:08.000-08:00
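Summary: Pull Request resolved: #1516. Adds support for one-hot encoded categorical features to the `Round` input transform and uses straight-through estimators (STEs) for the gradients by default (see title). Reviewed By: Balandat. Differential Revision: D41477456. fbshipit-source-id: 21e500b887349f8164223fee696e46c506d61ab2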
diff --git a/botorch/models/transforms/input.py b/botorch/models/transforms/input.py
@@ -12,19 +12,19 @@
 rounding functions, and log transformations. The input transformation
 is typically part of a Model and applied within the model.forward()
 method.
-
 """
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 from typing import Any, Callable, Dict, List, Optional, Union
+from warnings import warn
 
 import torch
 from botorch.exceptions.errors import BotorchTensorDimensionError
 from botorch.models.transforms.utils import subset_transform
 from botorch.models.utils import fantasize
-from botorch.utils.rounding import approximate_round
+from botorch.utils.rounding import approximate_round, OneHotArgmaxSTE, RoundSTE
 from gpytorch import Module as GPyTorchModule
 from gpytorch.constraints import GreaterThan
 from gpytorch.priors import Prior
@@ -649,10 +649,10 @@ def _update_coefficients(self, X: Tensor) -> None:
 
 
 class Round(InputTransform, Module):
-    r"""A rounding transformation for integer inputs.
+    r"""A discretization transformation for discrete inputs.
 
-    This will typically be used in conjunction with normalization as
-    follows:
+    For integers, this will typically be used in conjunction
+    with normalization as follows:
 
     In eval() mode (i.e. after training), the inputs pass
     would typically be normalized to the unit cube (e.g. during candidate
@@ -667,19 +667,26 @@ class Round(InputTransform, Module):
     should be set to False, so that the raw inputs are rounded and then
     normalized to the unit cube.
 
-    This transformation uses differentiable approximate rounding by default.
-    The rounding function is approximated with a piece-wise function where
-    each piece is a hyperbolic tangent function.
+    By default, straight-through estimators (STEs) are used for the gradients,
+    as proposed in [Daulton2022bopr]_. Alternatively, setting `approximate=True`
+    enables differentiable approximate rounding (currently only for integers),
+    in which the rounding function is approximated with a piece-wise function
+    where each piece is a hyperbolic tangent function.
+
+    For categorical parameters, the input must be one-hot encoded.
 
     Example:
+        >>> bounds = torch.tensor([[0, 5], [0, 1], [0, 1]]).t()
+        >>> integer_indices = [0]
+        >>> categorical_features = {1: 2}
         >>> unnormalize_tf = Normalize(
         >>>     d=d,
         >>>     bounds=bounds,
         >>>     transform_on_eval=True,
         >>>     transform_on_train=True,
         >>>     reverse=True,
         >>> )
-        >>> round_tf = Round(integer_indices)
+        >>> round_tf = Round(integer_indices, categorical_features)
         >>> normalize_tf = Normalize(d=d, bounds=bounds)
         >>> tf = ChainedInputTransform(
         >>>     tf1=unnormalize_tf, tf2=round_tf, tf3=normalize_tf
@@ -688,46 +695,76 @@ class Round(InputTransform, Module):
 
     def __init__(
         self,
-        indices: List[int],
+        integer_indices: Optional[List[int]] = None,
+        categorical_features: Optional[Dict[int, int]] = None,
         transform_on_train: bool = True,
         transform_on_eval: bool = True,
         transform_on_fantasize: bool = True,
-        approximate: bool = True,
+        approximate: bool = False,
         tau: float = 1e-3,
+        **kwargs,
     ) -> None:
         r"""Initialize transform.
 
         Args:
-            indices: The indices of the integer inputs.
+            integer_indices: The indices of the integer inputs.
+            categorical_features: A dictionary mapping the starting index of each
+                categorical feature to its cardinality. This assumes that categoricals
+                are one-hot encoded.
             transform_on_train: A boolean indicating whether to apply the
                 transforms in train() mode. Default: True.
             transform_on_eval: A boolean indicating whether to apply the
                 transform in eval() mode. Default: True.
             transform_on_fantasize: A boolean indicating whether to apply the
                 transform when called from within a `fantasize` call. Default: True.
             approximate: A boolean indicating whether approximate or exact
-                rounding should be used. Default: approximate.
+                rounding should be used. Default: False.
             tau: The temperature parameter for approximate rounding.
         """
+        indices = kwargs.get("indices")
+        if indices is not None:
+            warn(
+                "`indices` is marked for deprecation in favor of `integer_indices`.",
+                DeprecationWarning,
+            )
+            integer_indices = indices
+        if approximate and categorical_features is not None:
+            raise NotImplementedError
         super().__init__()
         self.transform_on_train = transform_on_train
         self.transform_on_eval = transform_on_eval
         self.transform_on_fantasize = transform_on_fantasize
-        self.register_buffer("indices", torch.tensor(indices, dtype=torch.long))
+        integer_indices = integer_indices or []
+        self.register_buffer(
+            "integer_indices", torch.tensor(integer_indices, dtype=torch.long)
+        )
+        self.categorical_features = categorical_features or {}
         self.approximate = approximate
         self.tau = tau
 
-    @subset_transform
     def transform(self, X: Tensor) -> Tensor:
-        r"""Round the inputs.
+        r"""Discretize the inputs.
 
         Args:
             X: A `batch_shape x n x d`-dim tensor of inputs.
 
         Returns:
-            A `batch_shape x n x d`-dim tensor of rounded inputs.
+            A `batch_shape x n x d`-dim tensor of discretized inputs.
         """
-        return approximate_round(X, tau=self.tau) if self.approximate else X.round()
+        X_rounded = X.clone()
+        # round integers
+        X_int = X_rounded[..., self.integer_indices]
+        if self.approximate:
+            X_int = approximate_round(X_int, tau=self.tau)
+        else:
+            X_int = RoundSTE.apply(X_int)
+        X_rounded[..., self.integer_indices] = X_int
+        # discretize categoricals to the category with the largest value
+        # in the continuous relaxation of the one-hot encoding
+        for start, card in self.categorical_features.items():
+            end = start + card
+            X_rounded[..., start:end] = OneHotArgmaxSTE.apply(X[..., start:end])
+        return X_rounded
 
     def equals(self, other: InputTransform) -> bool:
         r"""Check if another input transform is equivalent.
@@ -740,6 +777,8 @@ def equals(self, other: InputTransform) -> bool:
         """
         return (
             super().equals(other=other)
+            and (self.integer_indices == other.integer_indices).all()
+            and self.categorical_features == other.categorical_features
             and self.approximate == other.approximate
             and self.tau == other.tau
         )
diff --git a/test/models/transforms/test_input.py b/test/models/transforms/test_input.py
@@ -5,9 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 
 import itertools
+import warnings
 from copy import deepcopy
 
 import torch
+from botorch import settings
 from botorch.exceptions.errors import BotorchTensorDimensionError
 from botorch.models.transforms.input import (
     AffineInputTransform,
@@ -29,6 +31,7 @@
 from torch import Tensor
 from torch.distributions import Kumaraswamy
 from torch.nn import Module
+from torch.nn.functional import one_hot
 
 
 def get_test_warp(indices, **kwargs):
@@ -534,19 +537,45 @@ def test_chained_input_transform(self):
     def test_round_transform(self):
         for dtype in (torch.float, torch.double):
             # basic init
-            int_idcs = [0, 2]
-            round_tf = Round(indices=[0, 2])
-            self.assertEqual(round_tf.indices.tolist(), int_idcs)
+            int_idcs = [0, 4]
+            categorical_feats = {2: 2, 5: 3}
+            # test deprecation warning
+            with warnings.catch_warnings(record=True) as ws, settings.debug(True):
+                Round(indices=int_idcs)
+                self.assertTrue(
+                    any(issubclass(w.category, DeprecationWarning) for w in ws)
+                )
+            round_tf = Round(
+                integer_indices=int_idcs, categorical_features=categorical_feats
+            )
+            self.assertEqual(round_tf.integer_indices.tolist(), int_idcs)
+            self.assertEqual(round_tf.categorical_features, categorical_feats)
             self.assertTrue(round_tf.training)
-            self.assertTrue(round_tf.approximate)
+            self.assertFalse(round_tf.approximate)
             self.assertEqual(round_tf.tau, 1e-3)
 
             # basic usage
-            for batch_shape, approx in itertools.product(
-                (torch.Size(), torch.Size([3])), (False, True)
+            for batch_shape, approx, categorical_features in itertools.product(
+                (torch.Size(), torch.Size([3])),
+                (False, True),
+                (None, categorical_feats),
             ):
-                X = 5 * torch.rand(*batch_shape, 4, 3, device=self.device, dtype=dtype)
-                round_tf = Round(indices=[0, 2], approximate=approx)
+                X = torch.rand(*batch_shape, 4, 8, device=self.device, dtype=dtype)
+                X[..., int_idcs] *= 5
+                if categorical_features is not None and approx:
+                    with self.assertRaises(NotImplementedError):
+                        Round(
+                            integer_indices=int_idcs,
+                            categorical_features=categorical_features,
+                            approximate=approx,
+                        )
+                    continue
+                round_tf = Round(
+                    integer_indices=int_idcs,
+                    categorical_features=categorical_features,
+                    approximate=approx,
+                    tau=1e-1,
+                )
                 X_rounded = round_tf(X)
                 exact_rounded_X_ints = X[..., int_idcs].round()
                 # check non-integers parameters are unchanged
@@ -560,17 +589,39 @@ def test_round_transform(self):
                             <= (X[..., int_idcs] - exact_rounded_X_ints).abs()
                         ).all()
                     )
+                    self.assertFalse(
+                        torch.equal(X_rounded[..., int_idcs], exact_rounded_X_ints)
+                    )
                 else:
-                    # check that exact rounding behaves as expected
+                    # check that exact rounding behaves as expected for integers
                     self.assertTrue(
                         torch.equal(X_rounded[..., int_idcs], exact_rounded_X_ints)
                     )
+                    if categorical_features is not None:
+                        # test that discretization works as expected for categoricals
+                        for start, card in categorical_features.items():
+                            end = start + card
+                            expected_categorical = one_hot(
+                                X[..., start:end].argmax(dim=-1), num_classes=card
+                            ).to(X)
+                            self.assertTrue(
+                                torch.equal(
+                                    X_rounded[..., start:end], expected_categorical
+                                )
+                            )
+                    # test that gradient information is passed via STE
+                    X2 = X.clone().requires_grad_(True)
+                    round_tf(X2).sum().backward()
+                    self.assertTrue(torch.equal(X2.grad, torch.ones_like(X2)))
                 with self.assertRaises(NotImplementedError):
                     round_tf.untransform(X_rounded)
 
                 # test no transform on eval
                 round_tf = Round(
-                    indices=int_idcs, approximate=approx, transform_on_eval=False
+                    integer_indices=int_idcs,
+                    categorical_features=categorical_features,
+                    approximate=approx,
+                    transform_on_eval=False,
                 )
                 X_rounded = round_tf(X)
                 self.assertFalse(torch.equal(X, X_rounded))
@@ -580,7 +631,10 @@ def test_round_transform(self):
 
                 # test no transform on train
                 round_tf = Round(
-                    indices=int_idcs, approximate=approx, transform_on_train=False
+                    integer_indices=int_idcs,
+                    categorical_features=categorical_features,
+                    approximate=approx,
+                    transform_on_train=False,
                 )
                 X_rounded = round_tf(X)
                 self.assertTrue(torch.equal(X, X_rounded))
@@ -590,27 +644,48 @@ def test_round_transform(self):
 
                 # test equals
                 round_tf2 = Round(
-                    indices=int_idcs, approximate=approx, transform_on_train=False
+                    integer_indices=int_idcs,
+                    categorical_features=categorical_features,
+                    approximate=approx,
+                    transform_on_train=False,
                 )
                 self.assertTrue(round_tf.equals(round_tf2))
                 # test different transform_on_train
-                round_tf2 = Round(indices=int_idcs, approximate=approx)
+                round_tf2 = Round(
+                    integer_indices=int_idcs,
+                    categorical_features=categorical_features,
+                    approximate=approx,
+                )
                 self.assertFalse(round_tf.equals(round_tf2))
                 # test different approx
+                round_tf = Round(
+                    integer_indices=int_idcs,
+                )
                 round_tf2 = Round(
-                    indices=int_idcs, approximate=not approx, transform_on_train=False
+                    integer_indices=int_idcs,
+                    approximate=not approx,
+                    transform_on_train=False,
                 )
                 self.assertFalse(round_tf.equals(round_tf2))
                 # test different indices
+                round_tf = Round(
+                    integer_indices=int_idcs,
+                    categorical_features=categorical_features,
+                    transform_on_train=False,
+                )
                 round_tf2 = Round(
-                    indices=[0, 1], approximate=approx, transform_on_train=False
+                    integer_indices=[0, 1],
+                    categorical_features=categorical_features,
+                    approximate=approx,
+                    transform_on_train=False,
                 )
                 self.assertFalse(round_tf.equals(round_tf2))
 
                 # test preprocess_transform
                 round_tf.transform_on_train = False
                 self.assertTrue(torch.equal(round_tf.preprocess_transform(X), X))
                 round_tf.transform_on_train = True
+                X_rounded = round_tf(X)
                 self.assertTrue(
                     torch.equal(round_tf.preprocess_transform(X), X_rounded)
                 )