Skip to content

Commit 11ed41c

Browse files
Merge pull request #2692 from kayween/fast-variational
Speed Up Variational Strategy
2 parents 60be953 + f29ed7c commit 11ed41c

15 files changed

+539
-40
lines changed

gpytorch/variational/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
IndependentMultitaskVariationalStrategy,
1313
MultitaskVariationalStrategy,
1414
)
15+
from .large_batch_variational_strategy import LargeBatchVariationalStrategy
1516
from .lmc_variational_strategy import LMCVariationalStrategy
1617
from .mean_field_variational_distribution import MeanFieldVariationalDistribution
1718
from .natural_variational_distribution import _NaturalVariationalDistribution, NaturalVariationalDistribution
@@ -29,6 +30,7 @@
2930
"GridInterpolationVariationalStrategy",
3031
"IndependentMultitaskVariationalStrategy",
3132
"LMCVariationalStrategy",
33+
"LargeBatchVariationalStrategy",
3234
"MultitaskVariationalStrategy",
3335
"OrthogonallyDecoupledVariationalStrategy",
3436
"VariationalStrategy",

gpytorch/variational/_variational_strategy.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ def forward(
129129
inducing_points: Tensor,
130130
inducing_values: Tensor,
131131
variational_inducing_covar: Optional[LinearOperator] = None,
132+
diag: bool = True,
132133
**kwargs,
133134
) -> MultivariateNormal:
134135
r"""
@@ -146,6 +147,12 @@ def forward(
146147
the distribution :math:`q(\mathbf u)` is
147148
Gaussian, then this variable is the covariance matrix of that Gaussian.
148149
Otherwise, it will be None.
150+
:param diag: If true and this module is in train mode, this method is allowed to skip the off-diagonal entries
151+
in the predictive covariance and only compute the predictive variance, whenever it's deemed more efficient
152+
by the underlying implementation. In that case, the off-diagonal entries in the covariance matrix of the
153+
returned :class:`~gpytorch.distributions.MultivariateNormal` could be arbitrary dummy values. If this
154+
argument is false, then this method computes the full covariance matrix even in train mode. This argument
155+
is ignored if this module is in eval mode, in which case the full covariance matrix is always computed.
149156
150157
:rtype: :obj:`~gpytorch.distributions.MultivariateNormal`
151158
:return: The distribution :math:`q( \mathbf f(\mathbf X))`
@@ -320,14 +327,21 @@ def get_fantasy_model(
320327
fantasy_model.prediction_strategy = fant_pred_strat
321328
return fantasy_model
322329

323-
def __call__(self, x: Tensor, prior: bool = False, **kwargs) -> MultivariateNormal:
330+
def __call__(self, x: Tensor, prior: bool = False, diag: bool = True, **kwargs) -> MultivariateNormal:
324331
# If we're in prior mode, then we're done!
325332
if prior:
326-
return self.model.forward(x, **kwargs)
333+
if isinstance(self.model, _VariationalStrategy):
334+
# If the model is itself a variational strategy, we need to force it to compute the full covariance in
335+
# case that the model is in train mode.
336+
return self.model.forward(x, diag=False, **kwargs)
337+
else:
338+
# Otherwise, the model is `ApproximateGP`. So we can just call forward.
339+
return self.model.forward(x, **kwargs)
327340

328341
# Delete previously cached items from the training distribution
329342
if self.training:
330343
self._clear_cache()
344+
331345
# (Maybe) initialize variational distribution
332346
if not self.variational_params_initialized.item():
333347
prior_dist = self.prior_distribution
@@ -349,11 +363,17 @@ def __call__(self, x: Tensor, prior: bool = False, **kwargs) -> MultivariateNorm
349363
inducing_points,
350364
inducing_values=variational_dist_u.mean,
351365
variational_inducing_covar=variational_dist_u.lazy_covariance_matrix,
366+
diag=diag,
352367
**kwargs,
353368
)
354369
elif isinstance(variational_dist_u, Delta):
355370
return super().__call__(
356-
x, inducing_points, inducing_values=variational_dist_u.mean, variational_inducing_covar=None, **kwargs
371+
x,
372+
inducing_points,
373+
inducing_values=variational_dist_u.mean,
374+
variational_inducing_covar=None,
375+
diag=diag,
376+
**kwargs,
357377
)
358378
else:
359379
raise RuntimeError(

gpytorch/variational/additive_grid_interpolation_variational_strategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def forward(
6060
inducing_points: Tensor,
6161
inducing_values: Tensor,
6262
variational_inducing_covar: Optional[LinearOperator] = None,
63-
*params,
63+
diag: bool = True,
6464
**kwargs,
6565
) -> MultivariateNormal:
6666
if x.ndimension() == 1:
@@ -72,7 +72,7 @@ def forward(
7272
if num_dim != self.num_dim:
7373
raise RuntimeError("The number of dims should match the number specified.")
7474

75-
output = super().forward(x, inducing_points, inducing_values, variational_inducing_covar)
75+
output = super().forward(x, inducing_points, inducing_values, variational_inducing_covar, diag=diag)
7676
if self.sum_output:
7777
if variational_inducing_covar is not None:
7878
mean = output.mean.sum(0)

gpytorch/variational/batch_decoupled_variational_strategy.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ def forward(
181181
inducing_points: Tensor,
182182
inducing_values: Tensor,
183183
variational_inducing_covar: Optional[LinearOperator] = None,
184+
diag: bool = True,
184185
**kwargs,
185186
) -> MultivariateNormal:
186187
# We'll compute the covariance, and cross-covariance terms for both the

gpytorch/variational/ciq_variational_strategy.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,12 +195,12 @@ def forward(
195195
inducing_points: torch.Tensor,
196196
inducing_values: torch.Tensor,
197197
variational_inducing_covar: Optional[LinearOperator] = None,
198-
*params,
198+
diag: bool = True,
199199
**kwargs,
200200
) -> MultivariateNormal:
201201
# Compute full prior distribution
202202
full_inputs = torch.cat([inducing_points, x], dim=-2)
203-
full_output = self.model.forward(full_inputs, *params, **kwargs)
203+
full_output = self.model.forward(full_inputs, **kwargs)
204204
full_covar = full_output.lazy_covariance_matrix
205205

206206
# Covariance terms
@@ -275,7 +275,7 @@ def kl_divergence(self) -> Tensor:
275275
else:
276276
return super().kl_divergence()
277277

278-
def __call__(self, x: torch.Tensor, prior: bool = False, *params, **kwargs) -> MultivariateNormal:
278+
def __call__(self, x: torch.Tensor, prior: bool = False, diag: bool = True, **kwargs) -> MultivariateNormal:
279279
# This is mostly the same as _VariationalStrategy.__call__()
280280
# but with special rules for natural gradient descent (to prevent O(M^3) computation)
281281

@@ -313,7 +313,7 @@ def __call__(self, x: torch.Tensor, prior: bool = False, *params, **kwargs) -> M
313313
inducing_points,
314314
inducing_values=None,
315315
variational_inducing_covar=None,
316-
*params,
316+
diag=diag,
317317
**kwargs,
318318
)
319319
else:
@@ -327,6 +327,7 @@ def __call__(self, x: torch.Tensor, prior: bool = False, *params, **kwargs) -> M
327327
inducing_points,
328328
inducing_values=variational_dist_u.mean,
329329
variational_inducing_covar=variational_dist_u.lazy_covariance_matrix,
330+
diag=diag,
330331
**kwargs,
331332
)
332333
elif isinstance(variational_dist_u, Delta):
@@ -336,6 +337,7 @@ def __call__(self, x: torch.Tensor, prior: bool = False, *params, **kwargs) -> M
336337
inducing_points,
337338
inducing_values=variational_dist_u.mean,
338339
variational_inducing_covar=None,
340+
diag=diag,
339341
**kwargs,
340342
)
341343
else:

gpytorch/variational/grid_interpolation_variational_strategy.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
#!/usr/bin/env python3
22

3+
from typing import Optional
4+
35
import torch
6+
from linear_operator import LinearOperator
47
from linear_operator.operators import InterpolatedLinearOperator
58
from linear_operator.utils.interpolation import left_interp
9+
from torch import Tensor
610

711
from ..distributions import MultivariateNormal
812
from ..utils.interpolation import Interpolation
@@ -77,7 +81,15 @@ def prior_distribution(self):
7781
res = MultivariateNormal(out.mean, out.lazy_covariance_matrix.add_jitter(1e-3))
7882
return res
7983

80-
def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None):
84+
def forward(
85+
self,
86+
x: Tensor,
87+
inducing_points: Tensor,
88+
inducing_values: Tensor,
89+
variational_inducing_covar: Optional[LinearOperator] = None,
90+
diag: bool = True,
91+
**kwargs,
92+
):
8193
if variational_inducing_covar is None:
8294
raise RuntimeError(
8395
"GridInterpolationVariationalStrategy is only compatible with Gaussian variational "

gpytorch/variational/independent_multitask_variational_strategy.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#!/usr/bin/env python3
22

33
import warnings
4+
from typing import Optional
45

56
import torch
67
from linear_operator.operators import RootLinearOperator
8+
from torch import LongTensor, Tensor
79

810
from ..distributions import MultitaskMultivariateNormal, MultivariateNormal
911
from ..module import Module
@@ -49,11 +51,19 @@ def variational_params_initialized(self):
4951
def kl_divergence(self):
5052
return super().kl_divergence().sum(dim=-1)
5153

52-
def __call__(self, x, task_indices=None, prior=False, **kwargs):
54+
def __call__(
55+
self,
56+
x: Tensor,
57+
task_indices: Optional[LongTensor] = None,
58+
prior: bool = False,
59+
diag: bool = True,
60+
**kwargs,
61+
):
5362
r"""
5463
See :class:`LMCVariationalStrategy`.
5564
"""
56-
function_dist = self.base_variational_strategy(x, prior=prior, **kwargs)
65+
# Compute the full covariance because we might use the off-diagonal entries below
66+
function_dist = self.base_variational_strategy(x, prior=prior, diag=False, **kwargs)
5767

5868
if task_indices is None:
5969
# Every data point will get an output for each task
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
from typing import Optional, Tuple

import torch
from linear_operator.operators import DiagLinearOperator, LinearOperator, MatmulLinearOperator
from torch import Tensor

from gpytorch.variational.variational_strategy import VariationalStrategy
7+
8+
9+
class QuadFormDiagonal(torch.autograd.Function):
    r"""Autograd function returning ``torch.diag(B' A B)`` for a symmetric matrix ``A``.

    Compared with PyTorch's default autograd engine, the hand-written backward pass skips one
    large matmul, which pays off when ``B`` has far more columns than rows.
    """

    @staticmethod
    def forward(ctx, matrix: Tensor, rhs: Tensor):
        r"""Compute the quadratic-form diagonal without materializing ``B' A B``.

        :param matrix: A symmetric matrix of size `(..., M, M)`.
        :param rhs: The right-hand side of size `(..., M, N)`.

        :return: The diagonal of the quadratic form, of size `(..., N)`.
        """
        # diag(B' A B)_j = b_j' (A b_j): only column-wise dot products are formed.
        matrix_rhs = matrix @ rhs

        # `matrix` itself is never needed by the backward pass; stash only these two.
        ctx.save_for_backward(rhs, matrix_rhs)

        return (rhs * matrix_rhs).sum(dim=-2)

    @staticmethod
    def backward(ctx, d_diag: Tensor):
        rhs, matrix_rhs = ctx.saved_tensors

        # d/dA of sum_j w_j b_j' A b_j is B diag(w) B', where w is the incoming gradient.
        grad_matrix = rhs @ (d_diag.unsqueeze(-1) * rhs.mT)
        # d/dB: column j receives 2 w_j (A b_j) — valid because `A` is assumed symmetric.
        grad_rhs = 2.0 * matrix_rhs * d_diag.unsqueeze(-2)

        return grad_matrix, grad_rhs
40+
41+
42+
class LargeBatchVariationalStrategy(VariationalStrategy):
    r"""A fast variational strategy implementation optimized for large batch stochastic training on data center GPUs.

    This implementation has two assumptions on the use case:

    1. FP64 operations (in particular triangular solve and matmul) on data center GPUs are not much slower than FP32;
    2. The batch size is very large while the number of inducing points is moderate.

    This implementation speeds up the standard `VariationalStrategy` in two ways:

    1. Group the middle term `K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2}` when computing the predictive covariance, which saves a
       large triangular solve in the forward pass;
    2. Use a custom autograd function computing the diagonal of `K_XZ @ middle_term @ K_ZX` in train mode, which saves
       a large matmul in the backward pass.

    NOTE: Grouping the middle term is not numerically friendly, and thus we have to use double precision to stabilize
    the computation. As a result, this implementation is expected to be slow on CPUs and consumer GPUs. Those who use
    CPUs and consumer cards should use `VariationalStrategy` instead.
    """

    def _compute_predictive_updates(
        self,
        chol: LinearOperator,
        induc_data_covar: Tensor,
        inducing_values: Tensor,
        # `Optional[...]` (rather than PEP 604 `X | None`) keeps this annotation consistent with the
        # other variational strategies and importable without `from __future__ import annotations`.
        variational_inducing_covar: Optional[LinearOperator],
        prior_covar: LinearOperator,
        diag: bool = True,
    ) -> Tuple[Tensor, LinearOperator]:
        r"""Compute the updates to the prior predictive mean and covariance.

        :param chol: The lower-triangular Cholesky factor `K_ZZ^{1/2}` of size `(..., M, M)`.
        :param induc_data_covar: The inducing/data cross covariance `K_ZX` of size `(..., M, N)`.
        :param inducing_values: The variational mean `m` of size `(..., M)`.
        :param variational_inducing_covar: The variational covariance `S`, or None (e.g. for a Delta
            variational distribution), in which case `S` is treated as zero below.
        :param prior_covar: The covariance subtracted from `S` in the grouped middle term — the `I`
            in `K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2}`.
        :param diag: If true and this module is in train mode, only the predictive variance is
            computed (via :class:`QuadFormDiagonal`); the returned covariance update is then a
            `DiagLinearOperator` whose off-diagonal entries are meaningless. Otherwise the full
            covariance update is computed.

        :return: A tuple of the mean update `K_XZ K_ZZ^{-T/2} m` of size `(..., N)` and the
            covariance update `K_XZ K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2} K_ZX` as a `LinearOperator`.
        """
        dtype = induc_data_covar.dtype

        # Make `K_ZZ^{1/2}` dense because `TriangularLinearOperator` does not support solve with `left=False`.
        # All intermediates are promoted to FP64 because grouping the middle term is numerically delicate.
        chol = chol.to_dense().type(torch.float64)

        induc_data_covar = induc_data_covar.type(torch.float64)
        inducing_values = inducing_values.type(torch.float64)

        # The mean update `K_XZ K_ZZ^{-T/2} m`
        inv_chol_t_inducing_values = torch.linalg.solve_triangular(
            chol.mT, inducing_values.unsqueeze(-1), upper=True, left=True
        )
        mean_update = (induc_data_covar.mT @ inv_chol_t_inducing_values).squeeze(-1).type(dtype)

        # The grouped middle term `K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2}`
        middle_term = prior_covar.mul(-1).to_dense()
        if variational_inducing_covar is not None:
            middle_term = variational_inducing_covar.to_dense() + middle_term
        middle_term = middle_term.type(torch.float64)

        # Apply `K_ZZ^{-1/2}` on the right, then `K_ZZ^{-T/2}` on the left, via one-sided triangular solves.
        middle_term = torch.linalg.solve_triangular(chol, middle_term, upper=False, left=False)
        middle_term = torch.linalg.solve_triangular(chol.mT, middle_term, upper=True, left=True)

        # The covariance update `K_XZ K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2} K_ZX`
        if diag and self.training:
            # The custom autograd function has a faster backward pass, but it doesn't compute the off-diagonal entries.
            variance_update = QuadFormDiagonal.apply(middle_term, induc_data_covar)
            covar_update = DiagLinearOperator(diag=variance_update.type(dtype))
        else:
            covar_update = MatmulLinearOperator(
                induc_data_covar.mT.type(dtype), (middle_term @ induc_data_covar).type(dtype)
            )

        return mean_update, covar_update

gpytorch/variational/lmc_variational_strategy.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,13 @@ def kl_divergence(self) -> Tensor:
161161
return super().kl_divergence().sum(dim=self.latent_dim)
162162

163163
def __call__(
164-
self, x: Tensor, prior: bool = False, task_indices: Optional[LongTensor] = None, **kwargs
164+
self,
165+
x: Tensor,
166+
*,
167+
task_indices: Optional[LongTensor] = None,
168+
prior: bool = False,
169+
diag: bool = True,
170+
**kwargs,
165171
) -> Union[MultitaskMultivariateNormal, MultivariateNormal]:
166172
r"""
167173
Computes the variational (or prior) distribution
@@ -194,7 +200,7 @@ def __call__(
194200
:rtype: ~gpytorch.distributions.MultitaskMultivariateNormal (... x N x num_tasks)
195201
or ~gpytorch.distributions.MultivariateNormal (... x N)
196202
"""
197-
latent_dist = self.base_variational_strategy(x, prior=prior, **kwargs)
203+
latent_dist = self.base_variational_strategy(x, prior=prior, diag=False, **kwargs)
198204
num_batch = len(latent_dist.batch_shape)
199205
latent_dim = num_batch + self.latent_dim
200206

gpytorch/variational/nearest_neighbor_variational_strategy.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def _cholesky_factor(
133133
return TriangularLinearOperator(L)
134134

135135
def __call__(
136-
self, x: Float[Tensor, "... N D"], prior: bool = False, **kwargs: Any
136+
self, x: Float[Tensor, "... N D"], prior: bool = False, diag: bool = True, **kwargs: Any
137137
) -> Float[MultivariateNormal, "... N"]:
138138
# If we're in prior mode, then we're done!
139139
if prior:
@@ -180,8 +180,11 @@ def forward(
180180
inducing_points: Float[Tensor, "... M D"],
181181
inducing_values: Float[Tensor, "... M"],
182182
variational_inducing_covar: Optional[Float[LinearOperator, "... M M"]] = None,
183+
diag: bool = True,
183184
**kwargs: Any,
184185
) -> Float[MultivariateNormal, "... N"]:
186+
# TODO: This method needs to return the full covariance in eval mode, not just the predictive variance.
187+
# TODO: Use `diag` to control when to compute the variance vs. covariance in train mode.
185188
if self.training:
186189
# In training mode, note that the full inducing points set = full training dataset
187190
# Users have the option to choose input None or a tensor of training data for x

0 commit comments

Comments
 (0)