 
 import warnings
 from abc import abstractmethod
+from collections import defaultdict, OrderedDict
 from copy import deepcopy
-from typing import Callable, Dict, Iterable, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union
 
 import torch
 from linear_operator import to_dense, to_linear_operator
-from linear_operator.operators import LinearOperator, ZeroLinearOperator
+from linear_operator.operators import KernelLinearOperator, LinearOperator, ZeroLinearOperator
 from torch import Tensor
 from torch.nn import ModuleList
 
@@ -75,6 +76,45 @@ def _dist(self, x1, x2, x1_eq_x2=False, postprocess=False):
         return self._postprocess(res) if postprocess else res
 
 
+class _autograd_kernel_hack(object):
+    """
+    Helper class.
+
+    When using KernelLinearOperator, the `covar_func` cannot close over any Tensors that require gradients.
+    (Any Tensor that `covar_func` closes over will not backpropagate gradients.)
+    Unfortunately, for most kernels, `covar_func=self.forward`, which closes over all of the kernel's parameters.
+
+    This context manager temporarily replaces a kernel's (and its submodules') parameter dictionaries with an
+    external set of references to the same parameters.
+    The external set of references will be passed in by KernelLinearOperator.
+
+    This way, when calling self.forward, no parameter references are closed over, and so all parameters
+    will receive the appropriate gradients.
+    """
+
+    def __init__(self, kernel: Kernel, params: Iterable[torch.nn.Parameter], param_names: Iterable[str]):
+        self.temp_module_param_dicts = defaultdict(OrderedDict)
+        for name, param in zip(param_names, params):
+            split_name = name.split(".")
+            module = kernel
+            while len(split_name) > 1:
+                module_name, *remaining_names = split_name
+                module = getattr(module, module_name)
+                split_name = remaining_names
+            (base_param_name,) = split_name
+            self.temp_module_param_dicts[module][base_param_name] = param
+
+        self.orig_model_param_dicts = dict((module, module._parameters) for module in self.temp_module_param_dicts)
+
+    def __enter__(self):
+        for module, temp_param_dict in self.temp_module_param_dicts.items():
+            object.__setattr__(module, "_parameters", temp_param_dict)
+
+    def __exit__(self, type, value, traceback):
+        for module, orig_param_dict in self.orig_model_param_dicts.items():
+            object.__setattr__(module, "_parameters", orig_param_dict)
+
+
 class Kernel(Module):
     r"""
     Kernels in GPyTorch are implemented as a :class:`gpytorch.Module` that, when called on two :class:`torch.Tensor`
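For intuition, here is a minimal standalone sketch of the `_parameters` swap that `_autograd_kernel_hack` performs. The `torch.nn.Linear` module and the `external_weight` tensor are illustrative stand-ins for a kernel submodule and the external parameter references supplied by KernelLinearOperator; this snippet is not part of the diff.

import torch
from collections import OrderedDict

lin = torch.nn.Linear(2, 2)
external_weight = lin.weight * 1.0          # a reference that lives in an external autograd graph

swapped = OrderedDict(lin._parameters)      # copy so the bias entry stays available
swapped["weight"] = external_weight

orig = lin._parameters
object.__setattr__(lin, "_parameters", swapped)     # attribute lookups now resolve to the swap
try:
    out = lin(torch.randn(3, 2)).sum()
finally:
    object.__setattr__(lin, "_parameters", orig)    # always restore the real Parameters

out.backward()
assert lin.weight.grad is not None          # gradients flowed back through external_weight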
@@ -206,6 +246,37 @@ def __init__(
         # TODO: Remove this on next official PyTorch release.
         self.__pdist_supports_batch = True
 
+    @property
+    def _lazily_evaluate(self) -> bool:
+        r"""
+        Determines whether or not the kernel is lazily evaluated.
+
+        If False, kernel(x1, x2) produces a Tensor/LinearOperator where the covariance function has been evaluated
+        over x1 and x2.
+
+        If True, kernel(x1, x2) produces a KernelLinearOperator that delays evaluation of the kernel function.
+        The kernel function will only be evaluated when either
+        - a mathematical operation is performed on the kernel matrix (e.g. solves, logdets, etc.), or
+        - an indexing operation is performed on the kernel matrix to select specific covariance entries.
+
+        In general, _lazily_evaluate should return True (this option is more efficient), unless lazy evaluation
+        offers no gains and there is specific structure that would be lost with lazy evaluation
+        (e.g. low-rank/Nystrom approximations).
+        """
+        return True
+
+    def _kernel_linear_operator_covar_func(
+        self, x1: Tensor, x2: Tensor, *params: torch.nn.Parameter, param_names: Iterable[str] = (), **kwargs: Any
+    ) -> Union[Tensor, LinearOperator]:
+        # This is the `covar_func` that is passed into KernelLinearOperator.
+        # This function calls self.forward, but does so in a way that no parameters are closed over
+        # (by using the _autograd_kernel_hack context manager).
+        if any(param.requires_grad for param in params):
+            with _autograd_kernel_hack(self, params, param_names):
+                return self.forward(x1, x2, **kwargs)
+        else:
+            return self.forward(x1, x2, **kwargs)
+
     def _lengthscale_param(self, m: Kernel) -> Tensor:
         # Used by the lengthscale_prior
         return m.lengthscale
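To illustrate when a subclass might override the new property, the hypothetical kernel below returns False from `_lazily_evaluate`, forcing eager evaluation. The class name and its forward implementation are made up for this sketch; only the property override reflects the API added above.

import torch
from gpytorch.kernels import Kernel

class EagerSquaredExpKernel(Kernel):
    """Hypothetical kernel that always evaluates its covariance matrix eagerly."""

    has_lengthscale = True

    @property
    def _lazily_evaluate(self) -> bool:
        # Returning False makes kernel(x1, x2) call forward() right away and wrap the
        # dense result with to_linear_operator, instead of building a KernelLinearOperator.
        return False

    def forward(self, x1, x2, diag=False, **params):
        x1_, x2_ = x1.div(self.lengthscale), x2.div(self.lengthscale)
        return self.covar_dist(x1_, x2_, square_dist=True, diag=diag).div(-2.0).exp()

k = EagerSquaredExpKernel()
covar = k(torch.randn(10, 3), torch.randn(10, 3))  # evaluated eagerly, then wrapped as a LinearOperator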
@@ -451,7 +522,7 @@ def sub_kernels(self) -> Iterable[Kernel]:
             yield kernel
 
     def __call__(
-        self, x1: Tensor, x2: Optional[Tensor] = None, diag: bool = False, last_dim_is_batch: bool = False, **params
+        self, x1: Tensor, x2: Optional[Tensor] = None, diag: bool = False, last_dim_is_batch: bool = False, **kwargs
     ) -> Union[LazyEvaluatedKernelTensor, LinearOperator, Tensor]:
         r"""
         Computes the covariance between :math:`\mathbf x_1` and :math:`\mathbf x_2`.
@@ -508,7 +579,7 @@ def __call__(
             )
 
         if diag:
-            res = super(Kernel, self).__call__(x1_, x2_, diag=True, last_dim_is_batch=last_dim_is_batch, **params)
+            res = super(Kernel, self).__call__(x1_, x2_, diag=True, last_dim_is_batch=last_dim_is_batch, **kwargs)
             # Did this Kernel eat the diag option?
             # If it does not return a LazyEvaluatedKernelTensor, we can call diag on the output
             if not isinstance(res, LazyEvaluatedKernelTensor):
@@ -517,11 +588,42 @@ def __call__(
             return res
 
         else:
-            if settings.lazily_evaluate_kernels.on():
-                res = LazyEvaluatedKernelTensor(x1_, x2_, kernel=self, last_dim_is_batch=last_dim_is_batch, **params)
+            if (settings.lazily_evaluate_kernels.on() and self._lazily_evaluate) or last_dim_is_batch:
+                num_outputs_per_input = self.num_outputs_per_input(x1_, x2_)
+                named_parameters = tuple(self.named_parameters())
+
+                if last_dim_is_batch:
+                    x1_ = x1_.transpose(-1, -2).unsqueeze(-1)
+                    x2_ = x2_.transpose(-1, -2).unsqueeze(-1)
+
+                if len(named_parameters):
+                    param_names, params = zip(*named_parameters)
+                    param_batch_shapes = [self.batch_shape] * len(params)
+                    if last_dim_is_batch:
+                        params = [
+                            param.unsqueeze(len(param_batch_shape)).transpose(-1, len(param_batch_shape))
+                            for param, param_batch_shape in zip(params, param_batch_shapes)
+                        ]
+                        param_batch_shapes = [
+                            torch.Size([*param_batch_shape, x1_.size(-3)]) for param_batch_shape in param_batch_shapes
+                        ]
+                    res = KernelLinearOperator(
+                        x1_,
+                        x2_,
+                        *params,
+                        covar_func=self._kernel_linear_operator_covar_func,
+                        num_outputs_per_input=num_outputs_per_input,
+                        param_batch_shapes=param_batch_shapes,
+                        param_names=param_names,
+                        **kwargs,
+                    )
+                else:
+                    res = KernelLinearOperator(
+                        x1_, x2_, covar_func=self.forward, num_outputs_per_input=num_outputs_per_input, **kwargs
+                    )
             else:
                 res = to_linear_operator(
-                    super(Kernel, self).__call__(x1_, x2_, last_dim_is_batch=last_dim_is_batch, **params)
+                    super(Kernel, self).__call__(x1_, x2_, last_dim_is_batch=last_dim_is_batch, **kwargs)
                 )
         return res
 
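Assuming this diff is applied (and that RBFKernel keeps the base-class default of `_lazily_evaluate = True`), a rough usage sketch of the new code path in `__call__`:

import torch
from gpytorch.kernels import RBFKernel

kernel = RBFKernel()
x = torch.randn(200, 3)

lazy_covar = kernel(x, x)                     # KernelLinearOperator; forward() is deferred
block = lazy_covar[..., :10, :10].to_dense()  # indexing selects entries without materializing everything first
full = lazy_covar.to_dense()                  # or force full evaluation on demand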
@@ -593,13 +695,17 @@ class AdditiveKernel(Kernel):
     :param kernels: Kernels to add together.
     """
 
+    def __init__(self, *kernels: Iterable[Kernel]):
+        super(AdditiveKernel, self).__init__()
+        self.kernels = ModuleList(kernels)
+
     @property
     def is_stationary(self) -> bool:
         return all(k.is_stationary for k in self.kernels)
 
-    def __init__(self, *kernels: Iterable[Kernel]):
-        super(AdditiveKernel, self).__init__()
-        self.kernels = ModuleList(kernels)
+    @property
+    def _lazily_evaluate(self) -> bool:
+        return all(k._lazily_evaluate for k in self.kernels)
 
     def forward(self, x1: Tensor, x2: Tensor, diag: bool = False, **params) -> Union[Tensor, LinearOperator]:
         res = ZeroLinearOperator() if not diag else 0
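A loose, standalone illustration of why a sum of kernels can stay lazy (this is plain linear_operator behavior, not part of the diff): adding two lazily represented kernel matrices produces a SumLinearOperator, so neither summand needs to be densified up front.

import torch
from gpytorch.kernels import RBFKernel

x = torch.randn(50, 2)
k1, k2 = RBFKernel()(x, x), RBFKernel()(x, x)
lazy_sum = k1 + k2                  # SumLinearOperator; both summands remain lazy
print(type(lazy_sum).__name__)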
@@ -635,13 +741,17 @@ class ProductKernel(Kernel):
     :param kernels: Kernels to multiply together.
     """
 
+    def __init__(self, *kernels: Iterable[Kernel]):
+        super(ProductKernel, self).__init__()
+        self.kernels = ModuleList(kernels)
+
     @property
     def is_stationary(self) -> bool:
         return all(k.is_stationary for k in self.kernels)
 
-    def __init__(self, *kernels: Iterable[Kernel]):
-        super(ProductKernel, self).__init__()
-        self.kernels = ModuleList(kernels)
+    @property
+    def _lazily_evaluate(self) -> bool:
+        return False
 
     def forward(self, x1: Tensor, x2: Tensor, diag: bool = False, **params) -> Union[Tensor, LinearOperator]:
         x1_eq_x2 = torch.equal(x1, x2)
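Putting the two composite kernels side by side (a sketch that assumes this diff and that RBFKernel keeps the base-class default):

from gpytorch.kernels import AdditiveKernel, ProductKernel, RBFKernel

add_kernel = AdditiveKernel(RBFKernel(), RBFKernel())
prod_kernel = ProductKernel(RBFKernel(), RBFKernel())

print(add_kernel._lazily_evaluate)    # True: every summand opts into lazy evaluation
print(prod_kernel._lazily_evaluate)   # False: products fall back to the eager to_linear_operator path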