
Commit 24ee799

esantorella authored and facebook-github-bot committed
Make behavior of Cholesky caching more clear (#1504)
Summary:

## Motivation

General context: Caching is confusing and can lead to subtle issues. I have been trying to understand it better in order to reduce memory usage and improve runtime, since I've been seeing cache misses and tensors persisting longer than necessary. This PR doesn't fix that, but it does make things a tiny bit more transparent.

Two things make the `_cache_root_decomposition` method harder to understand than necessary:
1. It sets `self._baseline_L` and returns `None` rather than just returning `baseline_L`, so someone who sees a call to `_cache_root_decomposition` will not immediately realize that `self._baseline_L` has been set.
2. It uses two different kinds of caching: it sets `self._baseline_L`, and it also invisibly uses LinearOperator's caching.

This PR makes things more transparent by
* Adding comments
* Returning `baseline_L` rather than setting it as a side effect

### Have you read the [Contributing Guidelines on pull requests](https://github.com/pytorch/botorch/blob/main/CONTRIBUTING.md#pull-requests)?

Yes

Pull Request resolved: #1504

Test Plan: Unit tests

Reviewed By: Balandat

Differential Revision: D41308840

Pulled By: esantorella

fbshipit-source-id: 0a4c9e331923c3a6b8cc8212e5f605f1c65b9901
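As a minimal illustration of the pattern change the summary describes (the `ToyAcquisition` class below is hypothetical, not BoTorch's `CachedCholeskyMCAcquisitionFunction`), the old method caches via a side effect while the new one returns the root for the caller to store explicitly:

```python
# Minimal sketch of the pattern change; `ToyAcquisition` is hypothetical,
# not the actual BoTorch class touched by this commit.
import torch
from torch import Tensor


class ToyAcquisition(torch.nn.Module):
    def _cache_root_decomposition(self, covar: Tensor) -> None:
        # Old style: the cache is written as a side effect, so a caller
        # cannot tell from the call site that `self._baseline_L` was set.
        baseline_L = torch.linalg.cholesky(covar)
        self.register_buffer("_baseline_L", baseline_L)

    def _compute_root_decomposition(self, covar: Tensor) -> Tensor:
        # New style: return the root; the caller decides where to store it,
        # e.g. `self._baseline_L = self._compute_root_decomposition(covar)`.
        return torch.linalg.cholesky(covar)
```

Returning the value makes the assignment visible at the call site, which is the transparency gain the PR is after.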
1 parent 8eea8e1 commit 24ee799

File tree

5 files changed: +41, -10 lines


botorch/acquisition/cached_cholesky.py

Lines changed: 15 additions & 4 deletions
@@ -97,12 +97,24 @@ def _setup(
                 cache_root = False
         self._cache_root = cache_root
 
-    def _cache_root_decomposition(
+    def _compute_root_decomposition(
         self,
         posterior: Posterior,
-    ) -> None:
+    ) -> Tensor:
         r"""Cache Cholesky of the posterior covariance over f(X_baseline).
 
+        Because `LinearOperator.root_decomposition` is decorated with LinearOperator's
+        @cached decorator, this function is doing a lot implicitly:
+
+        1) Check if a root decomposition has already been cached to `lazy_covar`.
+            Note that it will not have been if `posterior.mvn` is a
+            `MultitaskMultivariateNormal`, since we construct `lazy_covar` in that
+            case.
+        2) If the root decomposition has not been found in the cache, compute it.
+        3) Write it to the cache of `lazy_covar`. Note that this will become inaccessible
+            if `posterior.mvn` is a `MultitaskMultivariateNormal`, since in that case
+            `lazy_covar`'s scope is only this function.
+
         Args:
             posterior: The posterior over f(X_baseline).
         """
@@ -112,8 +124,7 @@ def _cache_root_decomposition(
             lazy_covar = posterior.mvn.lazy_covariance_matrix
         with gpt_settings.fast_computations.covar_root_decomposition(False):
             lazy_covar_root = lazy_covar.root_decomposition()
-            baseline_L = lazy_covar_root.root.to_dense()
-        self.register_buffer("_baseline_L", baseline_L)
+            return lazy_covar_root.root.to_dense()
 
     def _get_f_X_samples(self, posterior: GPyTorchPosterior, q_in: int) -> Tensor:
         r"""Get posterior samples at the `q_in` new points from the joint posterior.
botorch/acquisition/monte_carlo.py

Lines changed: 13 additions & 1 deletion
@@ -254,6 +254,9 @@ def __init__(
             X_pending=X_pending,
         )
         self._setup(model=model, sampler=self.sampler, cache_root=cache_root)
+        # We make a copy here because we will write an attribute `base_samples`
+        # to `self.base_sampler.base_samples`, and we don't want to mutate
+        # `self.sampler`.
         self.base_sampler = deepcopy(self.sampler)
         if prune_baseline:
             X_baseline = prune_inferior_points(
@@ -272,13 +275,22 @@ def __init__(
             posterior = self.model.posterior(
                 X_baseline, posterior_transform=self.posterior_transform
             )
+            # Note: The root decomposition is cached in two different places. It
+            # may be confusing to have two different caches, but this is not
+            # trivial to change since each is needed for a different reason:
+            # - LinearOperator caching to `posterior.mvn` allows for reuse within
+            #   this function, which may be helpful if the same root decomposition
+            #   is produced by the calls to `self.base_sampler` and
+            #   `self._cache_root_decomposition`.
+            # - self._baseline_L allows a root decomposition to be persisted outside
+            #   this method.
             baseline_samples = self.base_sampler(posterior)
             baseline_obj = self.objective(baseline_samples, X=X_baseline)
             self.register_buffer("baseline_samples", baseline_samples)
             self.register_buffer(
                 "baseline_obj_max_values", baseline_obj.max(dim=-1).values
             )
-            self._cache_root_decomposition(posterior=posterior)
+            self._baseline_L = self._compute_root_decomposition(posterior=posterior)
 
     def _set_sampler(
         self,
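The new comment about copying the sampler can be illustrated with a small sketch; `ToySampler` below is a hypothetical stand-in, not BoTorch's `MCSampler`:

```python
# Why the deepcopy above matters: setting `base_samples` on the copy does not
# touch the original sampler. `ToySampler` is a hypothetical stand-in.
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class ToySampler:
    base_samples: Optional[torch.Tensor] = None


sampler = ToySampler()
base_sampler = deepcopy(sampler)
base_sampler.base_samples = torch.randn(4)
assert sampler.base_samples is None  # the original sampler was not mutated
```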

botorch/acquisition/multi_objective/monte_carlo.py

Lines changed: 5 additions & 1 deletion
@@ -545,7 +545,11 @@ def _set_cell_bounds(self, num_new_points: int) -> None:
             samples = self.base_sampler(posterior)
             # cache posterior
             if self._cache_root:
-                self._cache_root_decomposition(posterior=posterior)
+                # Note that this implicitly uses LinearOperator's caching to check if
+                # the proper root decomposition has already been cached to
+                # `posterior.mvn.lazy_covariance_matrix`, which it may have been in
+                # the call to `self.base_sampler`, and computes it if not found
+                self._baseline_L = self._compute_root_decomposition(posterior=posterior)
             obj = self.objective(samples, X=self.X_baseline)
             if self.constraints is not None:
                 feas = torch.stack(

test/acquisition/test_cached_cholesky.py

Lines changed: 7 additions & 3 deletions
@@ -110,7 +110,9 @@ def test_cache_root_decomposition(self):
                 with mock.patch(
                     CHOLESKY_PATH, return_value=baseline_L
                 ) as mock_cholesky:
-                    acqf._cache_root_decomposition(posterior=posterior)
+                    baseline_L_acqf = acqf._compute_root_decomposition(
+                        posterior=posterior
+                    )
                 mock_extract_batch_covar.assert_called_once_with(posterior.mvn)
                 mock_cholesky.assert_called_once()
             # test mvn
@@ -121,10 +123,12 @@ def test_cache_root_decomposition(self):
                 with mock.patch(
                     CHOLESKY_PATH, return_value=baseline_L
                 ) as mock_cholesky:
-                    acqf._cache_root_decomposition(posterior=posterior)
+                    baseline_L_acqf = acqf._compute_root_decomposition(
+                        posterior=posterior
+                    )
                 mock_extract_batch_covar.assert_not_called()
                 mock_cholesky.assert_called_once()
-                self.assertTrue(torch.equal(acqf._baseline_L, baseline_L))
+                self.assertTrue(torch.equal(baseline_L_acqf, baseline_L))
 
     def test_get_f_X_samples(self):
         tkwargs = {"device": self.device}

test/acquisition/test_monte_carlo.py

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ def test_cache_root(self):
         pt = ScalarizedPosteriorTransform(weights=torch.tensor([-1]))
         with mock.patch.object(
             qNoisyExpectedImprovement,
-            "_cache_root_decomposition",
+            "_compute_root_decomposition",
         ) as mock_cache_root:
             acqf = qNoisyExpectedImprovement(
                 model=model,
