Handle Cholesky errors when fitting a fully bayesian model (#1507)

saitcakmak · facebook-github-bot · commit c6595ed2c862 · 2022-11-19T13:36:55.000-08:00
Summary: Pull Request resolved: #1507. X-link: facebook/Ax#1271. Adds a `_psd_safe_pyro_mvn_sample` helper that catches `LinAlgError`s raised in `pyro.sample` and retries with increased jitter. Modeled after linear_operator's `psd_safe_cholesky`. Reviewed By: Balandat. Differential Revision: D41405255. fbshipit-source-id: 6fea8f1a953d2ad8ec5c0ca2ca8c13732f729879
diff --git a/botorch/models/fully_bayesian.py b/botorch/models/fully_bayesian.py
@@ -32,6 +32,7 @@
 
 
 import math
+import warnings
 from abc import abstractmethod
 from typing import Any, Dict, List, Mapping, Optional, Tuple
 
@@ -55,6 +56,7 @@
 from gpytorch.means.constant_mean import ConstantMean
 from gpytorch.means.mean import Mean
 from gpytorch.models.exact_gp import ExactGP
+from linear_operator import settings
 from torch import Tensor
 
 MIN_INFERRED_NOISE_LEVEL = 1e-6
@@ -81,6 +83,51 @@ def reshape_and_detach(target: Tensor, new_value: Tensor) -> None:
     return new_value.detach().clone().view(target.shape).to(target)
 
 
+def _psd_safe_pyro_mvn_sample(
+    name: str, loc: Tensor, covariance_matrix: Tensor, obs: Tensor
+) -> None:
+    r"""Wraps the `pyro.sample` call in a loop to add an increasing series of jitter
+    to the covariance matrix each time we get a LinAlgError.
+
+    The first attempt uses `covariance_matrix` unchanged. After each failure the
+    call is retried with `jitter * I` added to the covariance matrix, for up to
+    `settings.cholesky_max_tries` retries. The jitter starts at
+    `settings.cholesky_jitter` (for the dtype of `loc`) and is multiplied by
+    `10**i` after the i-th (0-based) attempt fails, so the growth is cumulative.
+
+    This is modelled after linear_operator's `psd_safe_cholesky`.
+
+    Args:
+        name: Name of the Pyro sample site.
+        loc: Mean of the multivariate normal.
+        covariance_matrix: Covariance matrix of the multivariate normal.
+        obs: Observed values passed to `pyro.sample`.
+
+    Raises:
+        torch.linalg.LinAlgError: If the final attempt still fails.
+        ValueError: If parameter validation still rejects the covariance on the
+            final attempt, or immediately for any ValueError that is not a
+            "satisfy the constraint" validation failure.
+    """
+    jitter = settings.cholesky_jitter.value(loc.dtype)
+    max_tries = settings.cholesky_max_tries.value()
+    # The identity matrix does not depend on the attempt, so build it once.
+    eye = torch.eye(
+        covariance_matrix.shape[-1],
+        device=covariance_matrix.device,
+        dtype=covariance_matrix.dtype,
+    )
+    for i in range(max_tries + 1):
+        jittered_covar = (
+            covariance_matrix if i == 0 else covariance_matrix + eye * jitter
+        )
+        try:
+            pyro.sample(
+                name,
+                pyro.distributions.MultivariateNormal(
+                    loc=loc,
+                    covariance_matrix=jittered_covar,
+                ),
+                obs=obs,
+            )
+            return
+        except (torch.linalg.LinAlgError, ValueError) as e:
+            if isinstance(e, ValueError) and "satisfy the constraint" not in str(e):
+                # Not-PSD can be also caught in Distribution.__init__ during parameter
+                # validation, which raises a ValueError. Only catch those errors.
+                raise
+            if i == max_tries:
+                # Out of retries: surface the failure instead of returning as if
+                # the observation had been registered.
+                raise
+            jitter = jitter * (10**i)
+            warnings.warn(
+                "Received a linear algebra error while sampling with Pyro. Adding a "
+                f"jitter of {jitter} to the covariance matrix and retrying.",
+                RuntimeWarning,
+            )
+
+
 class PyroModel:
     r"""
     Base class for a Pyro model; used to assist in learning hyperparameters.
@@ -164,12 +211,10 @@ def sample(self) -> None:
         lengthscale = self.sample_lengthscale(dim=self.ard_num_dims, **tkwargs)
         k = matern52_kernel(X=self.train_X, lengthscale=lengthscale)
         k = outputscale * k + noise * torch.eye(self.train_X.shape[0], **tkwargs)
-        pyro.sample(
-            "Y",
-            pyro.distributions.MultivariateNormal(
-                loc=mean.view(-1).expand(self.train_X.shape[0]),
-                covariance_matrix=k,
-            ),
+        _psd_safe_pyro_mvn_sample(
+            name="Y",
+            loc=mean.view(-1).expand(self.train_X.shape[0]),
+            covariance_matrix=k,
             obs=self.train_Y.squeeze(-1),
         )
 
diff --git a/botorch/models/fully_bayesian_multitask.py b/botorch/models/fully_bayesian_multitask.py
@@ -14,6 +14,7 @@
 import torch
 from botorch.acquisition.objective import PosteriorTransform
 from botorch.models.fully_bayesian import (
+    _psd_safe_pyro_mvn_sample,
     matern52_kernel,
     MIN_INFERRED_NOISE_LEVEL,
     PyroModel,
@@ -103,12 +104,10 @@ def sample(self) -> None:
         )
         k = k.mul(task_covar)
         k = outputscale * k + noise * torch.eye(self.train_X.shape[0], **tkwargs)
-        pyro.sample(
-            "Y",
-            pyro.distributions.MultivariateNormal(
-                loc=mean.view(-1).expand(self.train_X.shape[0]),
-                covariance_matrix=k,
-            ),
+        _psd_safe_pyro_mvn_sample(
+            name="Y",
+            loc=mean.view(-1).expand(self.train_X.shape[0]),
+            covariance_matrix=k,
             obs=self.train_Y.squeeze(-1),
         )
 
diff --git a/test/models/test_fully_bayesian.py b/test/models/test_fully_bayesian.py
@@ -6,6 +6,7 @@
 
 
 import itertools
+import warnings
 from unittest import mock
 
 import torch
@@ -32,6 +33,7 @@
 from botorch.models import ModelList, ModelListGP
 from botorch.models.deterministic import GenericDeterministicModel
 from botorch.models.fully_bayesian import (
+    _psd_safe_pyro_mvn_sample,
     MCMC_DIM,
     MIN_INFERRED_NOISE_LEVEL,
     PyroModel,
@@ -660,3 +662,53 @@ def f(x):
                         dist.cdf(x), q * torch.ones(1, 5, **tkwargs), atol=1e-4
                     )
                 )
+
+    def test_psd_safe_pyro_mvn_sample(self):
+        """Check the jitter-retry warning behavior of `_psd_safe_pyro_mvn_sample`."""
+        # Stand-in for `Distribution.__init__` that skips argument validation.
+        # NOTE: `mock_init` has no `self` parameter, so the `self` used in its body
+        # is the enclosing test case (captured by closure), not the distribution.
+        def mock_init(
+            batch_shape=torch.Size(),  # noqa
+            event_shape=torch.Size(),  # noqa
+            validate_args=None,
+        ):
+            self._batch_shape = batch_shape
+            self._event_shape = event_shape
+            self._validate_args = False
+
+        for dtype in (torch.float, torch.double):
+            tkwargs = {"dtype": dtype, "device": self.device}
+            loc = torch.rand(5, **tkwargs)
+            obs = torch.rand(5, **tkwargs)
+            psd_covar = torch.eye(5, **tkwargs)
+            # All-ones matrix: singular, used here as the failing covariance.
+            not_psd_covar = torch.ones(5, 5, **tkwargs)
+            # With a PSD covar, the first attempt should succeed, so no retry
+            # warning is expected.
+            with warnings.catch_warnings(record=True) as ws:
+                warnings.simplefilter("always")
+                _psd_safe_pyro_mvn_sample(
+                    name="Y", loc=loc, covariance_matrix=psd_covar, obs=obs
+                )
+            self.assertFalse(any("linear algebra error" in str(w.message) for w in ws))
+            # Failure raised as a ValueError (unpatched `Distribution.__init__`):
+            # the retry warning should be emitted.
+            with warnings.catch_warnings(record=True) as ws:
+                warnings.simplefilter("always")
+                _psd_safe_pyro_mvn_sample(
+                    name="Y", loc=loc, covariance_matrix=not_psd_covar, obs=obs
+                )
+            self.assertTrue(any("linear algebra error" in str(w.message) for w in ws))
+            # Failure raised as a LinAlgError: `Distribution.__init__` is replaced
+            # by `mock_init`, presumably so that validation no longer raises first.
+            with mock.patch(
+                "torch.distributions.multivariate_normal.Distribution.__init__",
+                wraps=mock_init,
+            ), warnings.catch_warnings(record=True) as ws:
+                warnings.simplefilter("always")
+                _psd_safe_pyro_mvn_sample(
+                    name="Y", loc=loc, covariance_matrix=not_psd_covar, obs=obs
+                )
+            # With a not-PSD covar, the sample call is retried, so the retry
+            # warning should be emitted.
+            self.assertTrue(any("linear algebra error" in str(w.message) for w in ws))
+            # ValueErrors whose message lacks "satisfy the constraint" are not
+            # retried; they must propagate to the caller.
+            with mock.patch(
+                "torch.distributions.multivariate_normal.Distribution.__init__",
+                side_effect=ValueError("dummy error"),
+            ), self.assertRaisesRegex(ValueError, "dummy"):
+                _psd_safe_pyro_mvn_sample(
+                    name="Y", loc=loc, covariance_matrix=not_psd_covar, obs=obs
+                )