
Commit e4579ed

wjmaddox and Balandat authored
Add dtype registry for symeig (#1725)
* add dtype registry for symeig
* move to a linalg_dtypes operator
* Update gpytorch/test/lazy_tensor_test_case.py
* rename linalg dtypes

Co-authored-by: Max Balandat <[email protected]>
1 parent a6c5b02 commit e4579ed

File tree

5 files changed: +104, -61 lines

gpytorch/lazy/kronecker_product_added_diag_lazy_tensor.py

Lines changed: 9 additions & 8 deletions
@@ -118,16 +118,17 @@ def _solve(self, rhs, preconditioner=None, num_tridiag=0):

         rhs_dtype = rhs.dtype

+        # we perform the solve in double for numerical stability issues
+        symeig_dtype = settings._linalg_dtype_symeig.value()
+
         # if the diagonal is constant, we can solve this using the Kronecker-structured eigendecomposition
         # and performing a spectral shift of its eigenvalues
         if self._diag_is_constant:
-            # we perform the solve in double for numerical stability issues
-            # TODO: Use fp64 registry once #1213 is addressed
-            evals, q_matrix = self.lazy_tensor.to(torch.double).diagonalization()
-            evals_plus_diagonal = evals + self.diag_tensor.diag().double()
+            evals, q_matrix = self.lazy_tensor.to(symeig_dtype).diagonalization()
+            evals_plus_diagonal = evals + self.diag_tensor.diag().to(symeig_dtype)
             evals_root = evals_plus_diagonal.pow(0.5)
             inv_mat_sqrt = DiagLazyTensor(evals_root.reciprocal())
-            res = q_matrix.transpose(-2, -1).matmul(rhs.double())
+            res = q_matrix.transpose(-2, -1).matmul(rhs.to(symeig_dtype))
             res2 = inv_mat_sqrt.matmul(res)
             lazy_lhs = q_matrix.matmul(inv_mat_sqrt)
             return lazy_lhs.matmul(res2).type(rhs_dtype)
@@ -154,9 +155,9 @@ def _solve(self, rhs, preconditioner=None, num_tridiag=0):

         # again we perform the solve in double precision for numerical stability issues
         # TODO: Use fp64 registry once #1213 is addressed
-        rhs = rhs.double()
-        lt = self.lazy_tensor.to(torch.double)
-        dlt = self.diag_tensor.to(torch.double)
+        rhs = rhs.to(symeig_dtype)
+        lt = self.lazy_tensor.to(symeig_dtype)
+        dlt = self.diag_tensor.to(symeig_dtype)

         # If each of the diagonal factors is constant, life gets a little easier
         # as we can reuse the eigendecomposition
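For illustration, here is a minimal standalone sketch (not the library method) of what the constant-diagonal branch above computes, now in the registered symeig dtype. The helper name spectral_shift_solve and its arguments are hypothetical; evals and q_matrix stand in for a previously computed eigendecomposition K = Q diag(e) Q^T, and diag_const is the constant diagonal shift c.

    import torch
    from gpytorch import settings

    def spectral_shift_solve(evals, q_matrix, diag_const, rhs):
        # (K + c I)^{-1} rhs = Q diag(1 / (e + c)) Q^T rhs, computed in the symeig dtype
        symeig_dtype = settings._linalg_dtype_symeig.value()
        rhs_dtype = rhs.dtype
        inv_evals = (evals.to(symeig_dtype) + diag_const).reciprocal()
        q = q_matrix.to(symeig_dtype)
        res = q.transpose(-2, -1) @ rhs.to(symeig_dtype)
        # scale by the shifted inverse eigenvalues and rotate back, then cast to the caller's dtype
        return (q @ (inv_evals.unsqueeze(-1) * res)).type(rhs_dtype)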

gpytorch/lazy/lazy_tensor.py

Lines changed: 5 additions & 4 deletions
@@ -2170,10 +2170,11 @@ def _symeig(self, eigenvectors: bool = False) -> Tuple[Tensor, Optional["LazyTen
         if settings.verbose_linalg.on():
             settings.verbose_linalg.logger.debug(f"Running symeig on a matrix of size {self.shape}.")

-        dtype = self.dtype  # perform decomposition in double precision for numerical stability
-        # TODO: Use fp64 registry once #1213 is addressed
-        evals, evecs = torch.linalg.eigh(self.evaluate().to(dtype=torch.double))
-        # chop any negative eigenvalues. TODO: warn if evals are significantly negative
+        # potentially perform decomposition in double precision for numerical stability
+        dtype = self.dtype
+        evals, evecs = torch.linalg.eigh(self.evaluate().to(dtype=settings._linalg_dtype_symeig.value()))
+        # chop any negative eigenvalues.
+        # TODO: warn if evals are significantly negative
         evals = evals.clamp_min(0.0).to(dtype=dtype)
         if eigenvectors:
             evecs = NonLazyTensor(evecs.to(dtype=dtype))
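A hedged standalone sketch of the pattern _symeig now follows (the helper name is hypothetical, and it operates on a plain tensor rather than a LazyTensor): run torch.linalg.eigh in the registered dtype, clamp any small negative eigenvalues that arise from round-off, and cast back to the input dtype.

    import torch
    from gpytorch import settings

    def symeig_in_registry_dtype(mat: torch.Tensor):
        out_dtype = mat.dtype
        # decompose in the registered (by default double) dtype for stability
        evals, evecs = torch.linalg.eigh(mat.to(dtype=settings._linalg_dtype_symeig.value()))
        # chop any negative eigenvalues and cast results back to the caller's dtype
        evals = evals.clamp_min(0.0).to(dtype=out_dtype)
        return evals, evecs.to(dtype=out_dtype)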

gpytorch/settings.py

Lines changed: 35 additions & 1 deletion
@@ -671,7 +671,7 @@ class skip_logdet_forward(_feature_flag):
    pass will skip certain computations (i.e. the logdet computation), and will therefore
    be improper estimates.

-    If you're using SGD (or a varient) to optimize parameters, you probably
+    If you're using SGD (or a variant) to optimize parameters, you probably
    don't need an accurate MLL estimate; you only need accurate gradients. So
    this setting may give your model a performance boost.
@@ -681,6 +681,40 @@ class skip_logdet_forward(_feature_flag):
    _default = False


+class _linalg_dtype_symeig(_value_context):
+    _global_value = torch.double
+
+
+class _linalg_dtype_cholesky(_value_context):
+    _global_value = torch.double
+
+
+class linalg_dtypes:
+    """
+    Whether to perform less stable linalg calls in double precision or in a lower precision.
+    Currently, the default is to apply all symeig calls and cholesky calls within variational
+    methods in double precision.
+
+    (Default: torch.double)
+    """
+
+    def __init__(self, default=torch.double, symeig=None, cholesky=None):
+        symeig = default if symeig is None else symeig
+        cholesky = default if cholesky is None else cholesky
+
+        self.symeig = _linalg_dtype_symeig(symeig)
+        self.cholesky = _linalg_dtype_cholesky(cholesky)
+
+    def __enter__(self):
+        self.symeig.__enter__()
+        self.cholesky.__enter__()
+
+    def __exit__(self, *args):
+        self.symeig.__exit__()
+        self.cholesky.__exit__()
+        return False
+
+
 class terminate_cg_by_size(_feature_flag):
     """
     If set to true, cg will terminate after n iterations for an n x n matrix.
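Assuming _value_context.value() reports the currently active global value (as the calls elsewhere in this commit suggest), a small usage sketch of the new context manager might look like the following; the dtypes noted in the comments are expectations, not verified output.

    import torch
    from gpytorch import settings
    from gpytorch.settings import linalg_dtypes

    # Default: both registries report torch.double (torch.float64).
    print(settings._linalg_dtype_symeig.value())
    print(settings._linalg_dtype_cholesky.value())

    # Run the registered linalg calls in single precision within the block.
    with linalg_dtypes(torch.float):
        print(settings._linalg_dtype_symeig.value())    # expected: torch.float32
        print(settings._linalg_dtype_cholesky.value())  # expected: torch.float32

    # Mix precisions per operation: keep symeig in double, relax cholesky to float.
    with linalg_dtypes(symeig=torch.double, cholesky=torch.float):
        print(settings._linalg_dtype_cholesky.value())  # expected: torch.float32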

gpytorch/test/lazy_tensor_test_case.py

Lines changed: 49 additions & 43 deletions
@@ -9,6 +9,7 @@
 import torch

 import gpytorch
+from gpytorch.settings import linalg_dtypes
 from gpytorch.utils.cholesky import CHOLESKY_METHOD

 from .base_test_case import BaseTestCase
@@ -295,7 +296,7 @@ class LazyTensorTestCase(RectangularLazyTensorTestCase):
         "root_inv_decomposition": {"rtol": 0.05, "atol": 0.02},
         "sample": {"rtol": 0.3, "atol": 0.3},
         "sqrt_inv_matmul": {"rtol": 1e-4, "atol": 1e-3},
-        "symeig": {"rtol": 1e-4, "atol": 1e-3},
+        "symeig": {"double": {"rtol": 1e-4, "atol": 1e-3}, "float": {"rtol": 1e-3, "atol": 1e-2}},
         "svd": {"rtol": 1e-4, "atol": 1e-3},
     }
@@ -754,51 +755,56 @@ def test_sqrt_inv_matmul_no_lhs(self):
            self.assertAllClose(arg.grad, arg_copy.grad, **self.tolerances["sqrt_inv_matmul"])

     def test_symeig(self):
-        lazy_tensor = self.create_lazy_tensor().detach().requires_grad_(True)
-        lazy_tensor_copy = lazy_tensor.clone().detach().requires_grad_(True)
-        evaluated = self.evaluate_lazy_tensor(lazy_tensor_copy)
-
-        # Perform forward pass
-        evals_unsorted, evecs_unsorted = lazy_tensor.symeig(eigenvectors=True)
-        evecs_unsorted = evecs_unsorted.evaluate()
-
-        # since LazyTensor.symeig does not sort evals, we do this here for the check
-        evals, idxr = torch.sort(evals_unsorted, dim=-1, descending=False)
-        evecs = torch.gather(evecs_unsorted, dim=-1, index=idxr.unsqueeze(-2).expand(evecs_unsorted.shape))
-
-        evals_actual, evecs_actual = torch.linalg.eigh(evaluated.double())
-        evals_actual = evals_actual.to(dtype=evaluated.dtype)
-        evecs_actual = evecs_actual.to(dtype=evaluated.dtype)
-
-        # Check forward pass
-        self.assertAllClose(evals, evals_actual, **self.tolerances["symeig"])
-        lt_from_eigendecomp = evecs @ torch.diag_embed(evals) @ evecs.transpose(-1, -2)
-        self.assertAllClose(lt_from_eigendecomp, evaluated, **self.tolerances["symeig"])
-
-        # if there are repeated evals, we'll skip checking the eigenvectors for those
-        any_evals_repeated = False
-        evecs_abs, evecs_actual_abs = evecs.abs(), evecs_actual.abs()
-        for idx in itertools.product(*[range(b) for b in evals_actual.shape[:-1]]):
-            eval_i = evals_actual[idx]
-            if torch.unique(eval_i.detach()).shape[-1] == eval_i.shape[-1]:  # detach to avoid pytorch/pytorch#41389
-                self.assertAllClose(evecs_abs[idx], evecs_actual_abs[idx], **self.tolerances["symeig"])
-            else:
-                any_evals_repeated = True
+        dtypes = {"double": torch.double, "float": torch.float}
+        for name, dtype in dtypes.items():
+            tolerances = self.tolerances["symeig"][name]
+
+            lazy_tensor = self.create_lazy_tensor().detach().requires_grad_(True)
+            lazy_tensor_copy = lazy_tensor.clone().detach().requires_grad_(True)
+            evaluated = self.evaluate_lazy_tensor(lazy_tensor_copy)
+
+            # Perform forward pass
+            with linalg_dtypes(dtype):
+                evals_unsorted, evecs_unsorted = lazy_tensor.symeig(eigenvectors=True)
+                evecs_unsorted = evecs_unsorted.evaluate()
+
+            # since LazyTensor.symeig does not sort evals, we do this here for the check
+            evals, idxr = torch.sort(evals_unsorted, dim=-1, descending=False)
+            evecs = torch.gather(evecs_unsorted, dim=-1, index=idxr.unsqueeze(-2).expand(evecs_unsorted.shape))
+
+            evals_actual, evecs_actual = torch.linalg.eigh(evaluated.type(dtype))
+            evals_actual = evals_actual.to(dtype=evaluated.dtype)
+            evecs_actual = evecs_actual.to(dtype=evaluated.dtype)
+
+            # Check forward pass
+            self.assertAllClose(evals, evals_actual, **tolerances)
+            lt_from_eigendecomp = evecs @ torch.diag_embed(evals) @ evecs.transpose(-1, -2)
+            self.assertAllClose(lt_from_eigendecomp, evaluated, **tolerances)
+
+            # if there are repeated evals, we'll skip checking the eigenvectors for those
+            any_evals_repeated = False
+            evecs_abs, evecs_actual_abs = evecs.abs(), evecs_actual.abs()
+            for idx in itertools.product(*[range(b) for b in evals_actual.shape[:-1]]):
+                eval_i = evals_actual[idx]
+                if torch.unique(eval_i.detach()).shape[-1] == eval_i.shape[-1]:  # detach to avoid pytorch/pytorch#41389
+                    self.assertAllClose(evecs_abs[idx], evecs_actual_abs[idx], **tolerances)
+                else:
+                    any_evals_repeated = True

-        # Perform backward pass
-        symeig_grad = torch.randn_like(evals)
-        ((evals * symeig_grad).sum()).backward()
-        ((evals_actual * symeig_grad).sum()).backward()
+            # Perform backward pass
+            symeig_grad = torch.randn_like(evals)
+            ((evals * symeig_grad).sum()).backward()
+            ((evals_actual * symeig_grad).sum()).backward()

-        # Check grads if there were no repeated evals
-        if not any_evals_repeated:
-            for arg, arg_copy in zip(lazy_tensor.representation(), lazy_tensor_copy.representation()):
-                if arg_copy.requires_grad and arg_copy.is_leaf and arg_copy.grad is not None:
-                    self.assertAllClose(arg.grad, arg_copy.grad, **self.tolerances["symeig"])
+            # Check grads if there were no repeated evals
+            if not any_evals_repeated:
+                for arg, arg_copy in zip(lazy_tensor.representation(), lazy_tensor_copy.representation()):
+                    if arg_copy.requires_grad and arg_copy.is_leaf and arg_copy.grad is not None:
+                        self.assertAllClose(arg.grad, arg_copy.grad, **tolerances)

-        # Test with eigenvectors=False
-        _, evecs = lazy_tensor.symeig(eigenvectors=False)
-        self.assertIsNone(evecs)
+            # Test with eigenvectors=False
+            _, evecs = lazy_tensor.symeig(eigenvectors=False)
+            self.assertIsNone(evecs)

     def test_svd(self):
         lazy_tensor = self.create_lazy_tensor().detach().requires_grad_(True)

gpytorch/variational/variational_strategy.py

Lines changed: 6 additions & 5 deletions
@@ -6,7 +6,7 @@

 from ..distributions import MultivariateNormal
 from ..lazy import DiagLazyTensor, MatmulLazyTensor, RootLazyTensor, SumLazyTensor, TriangularLazyTensor, delazify
-from ..settings import trace_mode
+from ..settings import _linalg_dtype_cholesky, trace_mode
 from ..utils.cholesky import psd_safe_cholesky
 from ..utils.errors import CachingError
 from ..utils.memoize import cached, clear_cache_hook, pop_from_cache_ignore_args
@@ -69,7 +69,7 @@ def __init__(self, model, inducing_points, variational_distribution, learn_induc

     @cached(name="cholesky_factor", ignore_args=True)
     def _cholesky_factor(self, induc_induc_covar):
-        L = psd_safe_cholesky(delazify(induc_induc_covar).double())
+        L = psd_safe_cholesky(delazify(induc_induc_covar).type(_linalg_dtype_cholesky.value()))
         return TriangularLazyTensor(L)

     @property
@@ -109,7 +109,7 @@ def forward(self, x, inducing_points, inducing_values, variational_inducing_cova
         except CachingError:
             pass
         L = self._cholesky_factor(induc_induc_covar)
-        interp_term = L.inv_matmul(induc_data_covar.double()).to(full_inputs.dtype)
+        interp_term = L.inv_matmul(induc_data_covar.type(_linalg_dtype_cholesky.value())).to(full_inputs.dtype)

         # Compute the mean of q(f)
         # k_XZ K_ZZ^{-1/2} (m - K_ZZ^{-1/2} \mu_Z) + \mu_X
@@ -149,9 +149,10 @@ def __call__(self, x, prior=False, **kwargs):

         # Change the variational parameters to be whitened
         variational_dist = self.variational_distribution
-        mean_diff = (variational_dist.loc - prior_mean).unsqueeze(-1).double()
+        mean_diff = (variational_dist.loc - prior_mean).unsqueeze(-1).type(_linalg_dtype_cholesky.value())
         whitened_mean = L.inv_matmul(mean_diff).squeeze(-1).to(variational_dist.loc.dtype)
-        covar_root = variational_dist.lazy_covariance_matrix.root_decomposition().root.evaluate().double()
+        covar_root = variational_dist.lazy_covariance_matrix.root_decomposition().root.evaluate()
+        covar_root = covar_root.type(_linalg_dtype_cholesky.value())
         whitened_covar = RootLazyTensor(L.inv_matmul(covar_root).to(variational_dist.loc.dtype))
         whitened_variational_distribution = variational_dist.__class__(whitened_mean, whitened_covar)
         self._variational_distribution.initialize_variational_distribution(whitened_variational_distribution)
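As a rough illustration of the casting pattern used in _cholesky_factor and the whitening code, here is a hypothetical standalone helper that solves against a plain lower-triangular Cholesky factor in the registered Cholesky dtype. It assumes a recent PyTorch with torch.linalg.solve_triangular; the library itself goes through TriangularLazyTensor.inv_matmul rather than this function.

    import torch
    from gpytorch.settings import _linalg_dtype_cholesky

    def solve_against_cholesky_factor(L: torch.Tensor, rhs: torch.Tensor) -> torch.Tensor:
        # cast both operands to the registered Cholesky dtype before the triangular solve
        solve_dtype = _linalg_dtype_cholesky.value()
        out = torch.linalg.solve_triangular(L.to(solve_dtype), rhs.to(solve_dtype), upper=False)
        # cast the solve back to the caller's dtype
        return out.to(rhs.dtype)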
