
Commit 41789c7

Balandat authored and facebook-github-bot committed
Add propagate_grads context manager (#221)
Summary: Pull Request resolved: #221

Adds a `botorch.settings` module that introduces a context manager for the `propagate_grads` setting. This cleans up the API by removing the `propagate_grads` kwarg from `Model.posterior`. The new pattern for propagating gradients to the training inputs of a model is the following:

```
with settings.propagate_grads(True):
    post_X = self.posterior(X, observation_noise=observation_noise)
```

Right now this is essentially a thin wrapper around GPyTorch's `detach_test_caches` setting, but it allows implementing everything in a model-agnostic fashion.

Reviewed By: sdaulton

Differential Revision: D16583422

fbshipit-source-id: af070580f3016c57f70b726fc416307b772202e7
1 parent 3ad4cba commit 41789c7
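
For illustration, a minimal sketch of the new pattern applied end to end. The `SingleTaskGP` model and the toy data below are illustrative assumptions, not part of this commit:

```
import torch
from botorch import settings
from botorch.models import SingleTaskGP

# Toy training data (illustrative only); requires_grad lets us inspect
# gradients w.r.t. the training inputs afterwards.
train_X = torch.rand(10, 2, requires_grad=True)
train_Y = (train_X ** 2).sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)

test_X = torch.rand(5, 2)

# With the flag enabled, GPyTorch's test caches are not detached, so the
# posterior remains differentiable w.r.t. the training inputs.
with settings.propagate_grads(True):
    posterior = model.posterior(test_X)
    posterior.mean.sum().backward()

print(train_X.grad)  # gradients have been propagated to the training inputs
```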

File tree

8 files changed (+108, -42 lines)

  botorch/__init__.py
  botorch/models/gpytorch.py
  botorch/models/model.py
  botorch/settings.py
  sphinx/source/index.rst
  sphinx/source/settings.rst
  test/optim/test_random_restart_optimization.py
  test/test_settings.py


botorch/__init__.py

Lines changed: 10 additions & 1 deletion
@@ -2,7 +2,15 @@
 
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
-from . import acquisition, exceptions, models, optim, posteriors, test_functions
+from . import (
+    acquisition,
+    exceptions,
+    models,
+    optim,
+    posteriors,
+    settings,
+    test_functions,
+)
 from .cross_validation import batch_cross_validation
 from .fit import fit_gpytorch_model
 from .gen import gen_candidates_scipy, gen_candidates_torch, get_best_candidates
@@ -24,5 +32,6 @@
     "models",
     "optim",
     "posteriors",
+    "settings",
     "test_functions",
 ]

botorch/models/gpytorch.py

Lines changed: 24 additions & 38 deletions
@@ -14,11 +14,12 @@
 from typing import Any, List, Optional, Tuple
 
 import torch
-from gpytorch import settings
+from gpytorch import settings as gpt_settings
 from gpytorch.distributions import MultitaskMultivariateNormal, MultivariateNormal
 from gpytorch.lazy import lazify
 from torch import Tensor
 
+from .. import settings
 from ..posteriors.gpytorch import GPyTorchPosterior
 from .model import Model
 from .utils import _make_X_full, add_output_dim, multioutput_to_batch_mode_transform
@@ -40,22 +41,19 @@ def posterior(
             X: A `(batch_shape) x q x d`-dim Tensor, where `d` is the dimension of the
                 feature space and `q` is the number of points considered jointly.
             observation_noise: If True, add observation noise to the posterior.
-            propagate_grads: If True, do not detach GPyTorch's test caches when
-                computing the posterior. Required for being able to compute
-                derivatives with respect to training inputs at test time (used
-                e.g. by qNoisyExpectedImprovement). Defaults to `False`.
 
         Returns:
             A `GPyTorchPosterior` object, representing a batch of `b` joint
             distributions over `q` points. Includes observation noise if
             `observation_noise=True`.
         """
         self.eval()  # make sure model is in eval mode
-        detach_test_caches = not kwargs.get("propagate_grads", False)
         with ExitStack() as es:
-            es.enter_context(settings.debug(False))
-            es.enter_context(settings.fast_pred_var())
-            es.enter_context(settings.detach_test_caches(detach_test_caches))
+            es.enter_context(gpt_settings.debug(False))
+            es.enter_context(gpt_settings.fast_pred_var())
+            es.enter_context(
+                gpt_settings.detach_test_caches(settings.propagate_grads.off())
+            )
             mvn = self(X)
             if observation_noise:
                 # TODO: Allow passing in observation noise via kwarg
@@ -162,10 +160,6 @@ def posterior(
                 model's outputs are required for optimization. If omitted,
                 computes the posterior over all model outputs.
             observation_noise: If True, add observation noise to the posterior.
-            propagate_grads: If True, do not detach GPyTorch's test caches when
-                computing of the posterior. Required for being able to compute
-                derivatives with respect to training inputs at test time (used
-                e.g. by qNoisyExpectedImprovement). Defaults to `False`.
 
         Returns:
             A `GPyTorchPosterior` object, representing `batch_shape` joint
@@ -174,11 +168,12 @@ def posterior(
             `observation_noise=True`.
         """
         self.eval()  # make sure model is in eval mode
-        detach_test_caches = not kwargs.get("propagate_grads", False)
         with ExitStack() as es:
-            es.enter_context(settings.debug(False))
-            es.enter_context(settings.fast_pred_var())
-            es.enter_context(settings.detach_test_caches(detach_test_caches))
+            es.enter_context(gpt_settings.debug(False))
+            es.enter_context(gpt_settings.fast_pred_var())
+            es.enter_context(
+                gpt_settings.detach_test_caches(settings.propagate_grads.off())
+            )
             # insert a dimension for the output dimension
             if self._num_outputs > 1:
                 X, output_dim_idx = add_output_dim(
@@ -242,12 +237,9 @@ def condition_on_observations(
             num_outputs=self._num_outputs,
             train_Yvar=kwargs.get("noise", None),
         )
-        fant_kwargs = {k: v for k, v in kwargs.items() if k != "propagate_grads"}
         if noise is not None:
-            fant_kwargs.update({"noise": noise})
-        fantasy_model = super().condition_on_observations(
-            X=inputs, Y=targets, **fant_kwargs
-        )
+            kwargs.update({"noise": noise})
+        fantasy_model = super().condition_on_observations(X=inputs, Y=targets, **kwargs)
         fantasy_model._input_batch_shape = fantasy_model.train_targets.shape[
             : (-1 if self._num_outputs == 1 else -2)
         ]
@@ -286,23 +278,20 @@ def posterior(
                 model's outputs are required for optimization. If omitted,
                 computes the posterior over all model outputs.
             observation_noise: If True, add observation noise to the posterior.
-            propagate_grads: If True, do not detach GPyTorch's test caches when
-                computing of the posterior. Required for being able to compute
-                derivatives with respect to training inputs at test time (used
-                e.g. by qNoisyExpectedImprovement). Defaults to `False`.
 
         Returns:
             A `GPyTorchPosterior` object, representing `batch_shape` joint
             distributions over `q` points and the outputs selected by
             `output_indices` each. Includes measurement noise if
             `observation_noise=True`.
         """
-        detach_test_caches = not kwargs.get("propagate_grads", False)
         self.eval()  # make sure model is in eval mode
         with ExitStack() as es:
-            es.enter_context(settings.debug(False))
-            es.enter_context(settings.fast_pred_var())
-            es.enter_context(settings.detach_test_caches(detach_test_caches))
+            es.enter_context(gpt_settings.debug(False))
+            es.enter_context(gpt_settings.fast_pred_var())
+            es.enter_context(
+                gpt_settings.detach_test_caches(settings.propagate_grads.off())
+            )
             if output_indices is not None:
                 mvns = [self.forward_i(i, X) for i in output_indices]
                 if observation_noise:
@@ -357,10 +346,6 @@ def posterior(
                 model's outputs are required for optimization. If omitted,
                 computes the posterior over all model outputs.
             observation_noise: If True, add observation noise to the posterior.
-            propagate_grads: If True, do not detach GPyTorch's test caches when
-                computing of the posterior. Required for being able to compute
-                derivatives with respect to training inputs at test time (used
-                e.g. by qNoisyExpectedImprovement). Defaults to `False`.
 
         Returns:
             A `GPyTorchPosterior` object, representing `batch_shape` joint
@@ -377,11 +362,12 @@ def posterior(
         X_full = _make_X_full(X=X, output_indices=output_indices, tf=self._task_feature)
 
         self.eval()  # make sure model is in eval mode
-        detach_test_caches = not kwargs.get("propagate_grads", False)
         with ExitStack() as es:
-            es.enter_context(settings.debug(False))
-            es.enter_context(settings.fast_pred_var())
-            es.enter_context(settings.detach_test_caches(detach_test_caches))
+            es.enter_context(gpt_settings.debug(False))
+            es.enter_context(gpt_settings.fast_pred_var())
+            es.enter_context(
+                gpt_settings.detach_test_caches(settings.propagate_grads.off())
+            )
             mvn = self(X_full)
             if observation_noise:
                 # TODO: Allow passing in observation noise via kwarg
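
The model-side change is mechanical: instead of reading a `propagate_grads` kwarg, each `posterior` implementation now inverts the global flag when entering GPyTorch's `detach_test_caches` context, i.e. propagating gradients means not detaching the test caches. A minimal standalone sketch of that boolean mapping:

```
from botorch import settings

# Default: propagate_grads is off, so detach_test_caches(...) receives True.
assert settings.propagate_grads.off()

with settings.propagate_grads(True):
    # Flag on: detach_test_caches(...) receives False inside posterior().
    assert not settings.propagate_grads.off()
```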

botorch/models/model.py

Lines changed: 4 additions & 1 deletion
@@ -12,6 +12,7 @@
 from torch import Tensor
 from torch.nn import Module
 
+from .. import settings
 from ..posteriors import Posterior
 from ..sampling.samplers import MCSampler
 
@@ -96,6 +97,8 @@ def fantasize(
         Returns:
             The constructed fantasy model.
         """
-        post_X = self.posterior(X, observation_noise=observation_noise, **kwargs)
+        propagate_grads = kwargs.pop("propagate_grads", False)
+        with settings.propagate_grads(propagate_grads):
+            post_X = self.posterior(X, observation_noise=observation_noise)
         Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x m x o
         return self.condition_on_observations(X=X, Y=Y_fantasized, **kwargs)
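
For callers of `fantasize`, the kwarg-based interface is unchanged: `propagate_grads` is popped from `kwargs` and applied via the context manager around the `posterior` call, so it is no longer forwarded to `posterior` or to `condition_on_observations`. A hedged usage sketch; the model, sampler, and shapes below are illustrative assumptions:

```
import torch
from botorch.models import SingleTaskGP
from botorch.sampling.samplers import SobolQMCNormalSampler

train_X = torch.rand(10, 2)
train_Y = (train_X ** 2).sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)

sampler = SobolQMCNormalSampler(num_samples=16)
fantasy_X = torch.rand(4, 2)

# propagate_grads is consumed by fantasize() itself and routed through the
# new settings context manager rather than passed on to posterior().
fantasy_model = model.fantasize(
    X=fantasy_X, sampler=sampler, observation_noise=True, propagate_grads=True
)
```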

botorch/settings.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+import typing  # noqa F401
+
+
+class _Flag:
+    r"""Base class for context managers for a binary setting."""
+
+    _state: bool = False
+
+    @classmethod
+    def on(cls) -> bool:
+        return cls._state
+
+    @classmethod
+    def off(cls) -> bool:
+        return not cls._state
+
+    @classmethod
+    def _set_state(cls, state: bool) -> None:
+        cls._state = state
+
+    def __init__(self, state: bool = True) -> None:
+        self.prev = self.__class__.on()
+        self.state = state
+
+    def __enter__(self) -> None:
+        self.__class__._set_state(self.state)
+
+    def __exit__(self, *args) -> None:
+        self.__class__._set_state(self.prev)
+
+
+class propagate_grads(_Flag):
+    r"""Flag for propagating gradients to model training inputs / training data.
+
+    When set to `True`, gradients will be propagated to the training inputs.
+    This is useful in particular for propagating gradients through fantasy models.
+    """
+
+    _state: bool = False
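
The `_Flag` base class keeps its state on the class rather than on instances, so `on()` / `off()` can be queried globally while the context manager saves and restores the previous value on exit. A minimal sketch of defining an additional setting on top of it; the `log_shapes` flag is purely hypothetical and not part of this commit:

```
from botorch import settings


class log_shapes(settings._Flag):
    r"""Hypothetical flag for verbose shape logging (illustration only)."""

    _state: bool = False


# Class-level state: queries work without instantiating the flag.
assert log_shapes.off()

with log_shapes(True):
    assert log_shapes.on()  # state flipped inside the block

assert log_shapes.off()  # previous state restored on exit
```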

sphinx/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ BoTorch API Reference
    fit
    gen
    sampling
+   settings
    test_functions
    exceptions
    utils

sphinx/source/settings.rst

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+.. role:: hidden
+    :class: hidden-section
+
+botorch.settings
+================
+.. automodule:: botorch.settings
+.. currentmodule:: botorch.settings

test/optim/test_random_restart_optimization.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 import torch
 from botorch.acquisition import qExpectedImprovement
 from botorch.gen import gen_candidates_scipy, get_best_candidates
-from gpytorch import settings
+from gpytorch import settings as gpt_settings
 
 from ..test_gen import TestBaseCandidateGeneration
 
@@ -17,7 +17,7 @@ class TestRandomRestartOptimization(TestBaseCandidateGeneration):
    def test_random_restart_optimization(self, cuda=False):
        for double in (True, False):
            self._setUp(double=double, cuda=cuda)
-            with settings.debug(False):
+            with gpt_settings.debug(False):
                best_f = self.model(self.train_x).mean.max().item()
                qEI = qExpectedImprovement(self.model, best_f=best_f)
                bounds = torch.tensor([[0.0], [1.0]]).type_as(self.train_x)

test/test_settings.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+#! /usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import unittest
+
+from botorch import settings
+
+
+class TestSettings(unittest.TestCase):
+    def test_propagate_grads(self):
+        pgrads = settings.propagate_grads
+        self.assertFalse(pgrads.on())
+        self.assertTrue(pgrads.off())
+        with settings.propagate_grads(True):
+            self.assertTrue(pgrads.on())
+            self.assertFalse(pgrads.off())
+        self.assertFalse(pgrads.on())
+        self.assertTrue(pgrads.off())
