
Commit 3e85dc8

Updates from gpytorch 1.6.0-1.8.1
1 parent 2c3dd59 commit 3e85dc8

File tree
8 files changed: +66, -16 lines changed

linear_operator/functions/_pivoted_cholesky.py
Lines changed: 5 additions & 1 deletion

@@ -103,7 +103,11 @@ def backward(ctx, grad_output, _):

         with torch.enable_grad():
             # Create a new set of matrix args that we can backpropagate through
-            matrix_args = [matrix_arg.detach().requires_grad_(True) for matrix_arg in _matrix_args]
+            matrix_args = []
+            for matrix_arg in _matrix_args:
+                if matrix_arg.dtype in (torch.float, torch.double, torch.half):
+                    matrix_arg = matrix_arg.detach().requires_grad_(True)
+                matrix_args.append(matrix_arg)

             # Create new linear operator using new matrix args
             matrix = ctx.representation_tree(*matrix_args)

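The dtype check exists because PyTorch only allows `requires_grad_(True)` on floating-point (and complex) tensors; an integer argument such as an index tensor would raise a `RuntimeError`. A minimal standalone sketch of the same filtering pattern (the helper name is hypothetical, not part of the library):

```python
import torch

def detach_and_enable_grad(args):
    # Hypothetical helper mirroring the patch: only floating-point tensors can
    # have gradients enabled; integer tensors (e.g. index tensors) pass through.
    out = []
    for arg in args:
        if arg.dtype in (torch.float, torch.double, torch.half):
            arg = arg.detach().requires_grad_(True)
        out.append(arg)
    return out

dense = torch.randn(3, 3)
index = torch.tensor([0, 2, 1])  # calling .requires_grad_(True) on this would raise
dense_g, index_g = detach_and_enable_grad([dense, index])
assert dense_g.requires_grad and not index_g.requires_grad
```
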
linear_operator/operators/_linear_operator.py
Lines changed: 7 additions & 2 deletions

@@ -723,7 +723,8 @@ def _set_requires_grad(self, val: bool) -> None:
             arg.requires_grad_(val)
         for arg in self._kwargs.values():
             if hasattr(arg, "requires_grad"):
-                arg.requires_grad_(val)
+                if arg.dtype in (torch.float, torch.double, torch.half):
+                    arg.requires_grad_(val)

     def _solve(self, rhs: torch.Tensor, preconditioner: Callable, num_tridiag: int = 0) -> torch.Tensor:
         r"""
@@ -1759,6 +1760,10 @@ def mul(self, other: Union[float, torch.Tensor, "LinearOperator"]) -> LinearOper

         return self._mul_matrix(to_linear_operator(other))

+    @property
+    def ndim(self) -> int:
+        return self.ndimension()
+
     def ndimension(self) -> int:
         """
         Returns the number of dimensions.
@@ -2687,7 +2692,7 @@ def __torch_function__(
         if kwargs is None:
             kwargs = {}

-        if not isinstance(args[0], LinearOperator):
+        if not isinstance(args[0], cls):
             if func not in _HANDLED_SECOND_ARG_FUNCTIONS or not all(
                 issubclass(t, (torch.Tensor, LinearOperator)) for t in types
             ):

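The new `ndim` property mirrors `torch.Tensor.ndim` by delegating to `ndimension()`, so tensor-style code keeps working when handed a LinearOperator. A usage sketch, assuming `DiagLinearOperator` is importable from `linear_operator.operators` as elsewhere in this commit:

```python
import torch
from linear_operator.operators import DiagLinearOperator

# After this commit, `ndim` is available alongside `ndimension()`.
op = DiagLinearOperator(torch.rand(2, 5))    # a batch of two 5x5 diagonal operators
assert op.ndim == op.ndimension() == 3
assert op.ndim == torch.zeros(2, 5, 5).ndim  # same convention as a dense tensor
```
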
linear_operator/operators/block_diag_linear_operator.py
Lines changed: 13 additions & 1 deletion

@@ -12,7 +12,7 @@


 # metaclass of BlockDiagLinearOperator, overwrites behavior of constructor call
-# _MetaBlockDiagLinearOperator(base_linear_op, block_dim=-3) to return a DiagLazyTensor
+# _MetaBlockDiagLinearOperator(base_linear_op, block_dim=-3) to return a DiagLinearOperator
 # if base_linear_op is a DiagLinearOperator itself
 class _MetaBlockDiagLinearOperator(ABCMeta):
     def __call__(cls, base_linear_op: LinearOperator, block_dim=-3):
@@ -136,6 +136,18 @@ def inv_quad_logdet(self, inv_quad_rhs=None, logdet=False, reduce_inv_quad=True)
             logdet_res = logdet_res.view(*logdet_res.shape).sum(-1)
         return inv_quad_res, logdet_res

+    def matmul(self, other):
+        from .diag_linear_operator import DiagLinearOperator
+
+        # this is trivial if we multiply two BlockDiagLinearOperator
+        if isinstance(other, BlockDiagLinearOperator):
+            return BlockDiagLinearOperator(self.base_linear_op @ other.base_linear_op)
+        # special case if we have a DiagLinearOperator
+        if isinstance(other, DiagLinearOperator):
+            diag_reshape = other._diag.view(*self.base_linear_op.shape[:-2], 1, -1)
+            return BlockDiagLinearOperator(self.base_linear_op * diag_reshape)
+        return super().matmul(other)
+
     @cached(name="svd")
     def _svd(self) -> Tuple["LinearOperator", Tensor, "LinearOperator"]:
         U, S, V = self.base_linear_op.svd()

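The new fast path relies on block-diagonal structure being closed under these products: blockdiag(A_1, ..., A_k) @ blockdiag(B_1, ..., B_k) = blockdiag(A_1 B_1, ..., A_k B_k), and a diagonal right operand merely rescales each block's columns, which is what the `view(..., 1, -1)` reshape of the diagonal implements. A plain-PyTorch sanity check of both identities, independent of the library API:

```python
import torch

A = torch.randn(2, 3, 3)  # two 3x3 blocks
B = torch.randn(2, 3, 3)
d = torch.rand(2 * 3)

# blockdiag(A1, A2) @ blockdiag(B1, B2) == blockdiag(A1 @ B1, A2 @ B2)
assert torch.allclose(torch.block_diag(*A) @ torch.block_diag(*B),
                      torch.block_diag(*(A @ B)))

# A diagonal on the right only rescales each block's columns
assert torch.allclose(torch.block_diag(*A) @ torch.diag(d),
                      torch.block_diag(*(A * d.view(2, 1, 3))))
```
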
linear_operator/operators/cat_linear_operator.py
Lines changed: 4 additions & 1 deletion

@@ -209,7 +209,10 @@ def _get_indices(self, row_index, col_index, *batch_indices):
         if len(res_list) == 1:
             return res_list[0].view(target_shape).to(self.device)
         else:
-            return torch.cat(res_list).view(target_shape).to(self.device)
+            # Explicitly move tensors to one device as torch.cat no longer moves tensors:
+            # https://github.com/pytorch/pytorch/issues/35045
+            res_list = [linear_op.to(self.device) for linear_op in res_list]
+            return torch.cat(res_list).view(target_shape)

     def _getitem(
         self,

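Recent PyTorch versions require every input to `torch.cat` to live on the same device instead of silently moving them (see the linked issue), so the pieces are moved explicitly before concatenation. A minimal sketch of the same pattern; the helper name is hypothetical and the second branch assumes a GPU is available:

```python
import torch

def cat_on(device, tensors):
    # torch.cat requires all inputs on one device; move them explicitly first.
    return torch.cat([t.to(device) for t in tensors])

if torch.cuda.is_available():  # assumption: a GPU is present for the cross-device case
    parts = [torch.randn(2, 3), torch.randn(2, 3, device="cuda")]
    out = cat_on("cuda", parts)
    assert out.device.type == "cuda" and out.shape == (4, 3)
```
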
linear_operator/operators/diag_linear_operator.py
Lines changed: 6 additions & 3 deletions

@@ -8,6 +8,7 @@
 from .. import settings
 from ..utils.memoize import cached
 from ._linear_operator import LinearOperator
+from .block_diag_linear_operator import BlockDiagLinearOperator
 from .dense_linear_operator import DenseLinearOperator
 from .triangular_linear_operator import TriangularLinearOperator

@@ -163,15 +164,17 @@ def log(self) -> "DiagLinearOperator":
         return self.__class__(self._diag.log())

     def matmul(self, other: Union[Tensor, LinearOperator]) -> Union[Tensor, LinearOperator]:
-        from .triangular_linear_operator import TriangularLinearOperator
-
         # this is trivial if we multiply two DiagLinearOperators
         if isinstance(other, DiagLinearOperator):
             return DiagLinearOperator(self._diag * other._diag)
         # special case if we have a DenseLinearOperator
         if isinstance(other, DenseLinearOperator):
             return DenseLinearOperator(self._diag.unsqueeze(-1) * other.tensor)
-        # and if we have a triangular one
+        # special case if we have a BlockDiagLinearOperator
+        if isinstance(other, BlockDiagLinearOperator):
+            diag_reshape = self._diag.view(*other.base_linear_op.shape[:-1], 1)
+            return BlockDiagLinearOperator(diag_reshape * other.base_linear_op)
+        # special case if we have a TriangularLinearOperator
         if isinstance(other, TriangularLinearOperator):
             return TriangularLinearOperator(self._diag.unsqueeze(-1) * other._tensor, upper=other.upper)
         return super().matmul(other)

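This is the companion special case to the one added in block_diag_linear_operator.py: a diagonal on the left rescales each block's rows, so the product stays block diagonal. A plain-PyTorch check of the identity behind the `view(..., 1)` reshape:

```python
import torch

B = torch.randn(2, 3, 3)  # two 3x3 blocks
d = torch.rand(2 * 3)

# diag(d) @ blockdiag(B1, B2) rescales each block's rows
assert torch.allclose(torch.diag(d) @ torch.block_diag(*B),
                      torch.block_diag(*(d.view(2, 3, 1) * B)))
```
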
linear_operator/operators/interpolated_linear_operator.py
Lines changed: 12 additions & 0 deletions

@@ -12,6 +12,7 @@
 from ..utils.interpolation import left_interp, left_t_interp
 from ._linear_operator import LinearOperator
 from .dense_linear_operator import DenseLinearOperator, to_linear_operator
+from .diag_linear_operator import DiagLinearOperator
 from .root_linear_operator import RootLinearOperator


@@ -409,6 +410,17 @@ def matmul(self, tensor):
         # what we get from the function factory.
         # The _matmul_closure is optimized for repeated calls, such as for _solve

+        if isinstance(tensor, DiagLinearOperator):
+            # if we know the rhs is diagonal this is easy
+            new_right_interp_values = self.right_interp_values * tensor._diag.unsqueeze(-1)
+            return InterpolatedLinearOperator(
+                base_linear_op=self.base_linear_op,
+                left_interp_indices=self.left_interp_indices,
+                left_interp_values=self.left_interp_values,
+                right_interp_indices=self.right_interp_indices,
+                right_interp_values=new_right_interp_values,
+            )
+
         if tensor.ndimension() == 1:
             is_vector = True
             tensor = tensor.unsqueeze(-1)

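An InterpolatedLinearOperator represents a product of the form W_left @ K @ W_right.T, so a diagonal right-hand side can be folded into the right interpolation values: (W_left K W_right^T) diag(d) = W_left K (W_right * d[:, None])^T. A plain-PyTorch check of that identity with small dense stand-ins for the (normally sparse) interpolation matrices:

```python
import torch

W_left, K, W_right = torch.randn(4, 3), torch.randn(3, 3), torch.randn(5, 3)
d = torch.rand(5)

assert torch.allclose(W_left @ K @ W_right.T @ torch.diag(d),
                      W_left @ K @ (W_right * d.unsqueeze(-1)).T)
```
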
linear_operator/test/linear_operator_test_case.py
Lines changed: 14 additions & 5 deletions

@@ -8,6 +8,7 @@
 import torch

 import linear_operator
+from linear_operator.operators import DiagLinearOperator, to_dense
 from linear_operator.settings import linalg_dtypes
 from linear_operator.utils.errors import CachingError
 from linear_operator.utils.memoize import get_from_cache
@@ -34,14 +35,16 @@ def _test_matmul(self, rhs):
         linear_op = self.create_linear_op().detach().requires_grad_(True)
         linear_op_copy = torch.clone(linear_op).detach().requires_grad_(True)
         evaluated = self.evaluate_linear_op(linear_op_copy)
+        rhs_evaluated = to_dense(rhs)

         # Test operator
         res = linear_op @ rhs
-        actual = evaluated.matmul(rhs)
-        self.assertAllClose(res, actual)
+        actual = evaluated.matmul(rhs_evaluated)
+        res_evaluated = to_dense(res)
+        self.assertAllClose(res_evaluated, actual)

-        grad = torch.randn_like(res)
-        res.backward(gradient=grad)
+        grad = torch.randn_like(res_evaluated)
+        res_evaluated.backward(gradient=grad)
         actual.backward(gradient=grad)
         for arg, arg_copy in zip(linear_op.representation(), linear_op_copy.representation()):
             if arg_copy.requires_grad and arg_copy.is_leaf and arg_copy.grad is not None:
@@ -50,7 +53,7 @@ def _test_matmul(self, rhs):
         # Test __torch_function__
         res = torch.matmul(linear_op, rhs)
         actual = evaluated.matmul(rhs)
-        self.assertAllClose(res, actual)
+        self.assertAllClose(to_dense(res), actual)

     def _test_rmatmul(self, lhs):
         linear_op = self.create_linear_op().detach().requires_grad_(True)
@@ -305,6 +308,12 @@ def test_rmatmul_matrix(self):
         lhs = torch.randn(*linear_op.batch_shape, 4, linear_op.size(-2))
         return self._test_rmatmul(lhs)

+    def test_matmul_diag_matrix(self):
+        linear_op = self.create_linear_op()
+        diag = torch.rand(*linear_op.batch_shape, linear_op.size(-1))
+        rhs = DiagLinearOperator(diag)
+        return self._test_matmul(rhs)
+
     def test_matmul_matrix_broadcast(self):
         linear_op = self.create_linear_op()

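The base suite now also exercises products with a DiagLinearOperator right-hand side; since such a product may itself come back as a LinearOperator rather than a dense tensor, both sides are compared through `to_dense`. A sketch of the same comparison outside the test harness, using `DenseLinearOperator` as a stand-in for whatever `create_linear_op()` returns (assuming it is exported from `linear_operator.operators` alongside the names imported above):

```python
import torch
from linear_operator.operators import DenseLinearOperator, DiagLinearOperator, to_dense

op = DenseLinearOperator(torch.randn(4, 5))
rhs = DiagLinearOperator(torch.rand(5))
res = op @ rhs  # may itself be a LinearOperator, not a Tensor
assert torch.allclose(to_dense(res), to_dense(op) @ to_dense(rhs))
```
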
test/operators/test_identity_linear_operator.py
Lines changed: 5 additions & 3 deletions

@@ -4,7 +4,7 @@

 import torch

-from linear_operator.operators import IdentityLinearOperator
+from linear_operator.operators import IdentityLinearOperator, to_dense
 from linear_operator.test.linear_operator_test_case import LinearOperatorTestCase


@@ -13,10 +13,12 @@ def _test_matmul(self, rhs):
         linear_op = self.create_linear_op().detach().requires_grad_(True)
         linear_op_copy = linear_op.clone().detach().requires_grad_(True)
         evaluated = self.evaluate_linear_op(linear_op_copy)
+        rhs_evaluated = to_dense(rhs)

         res = linear_op.matmul(rhs)
-        actual = evaluated.matmul(rhs)
-        self.assertAllClose(res, actual)
+        actual = evaluated.matmul(rhs_evaluated)
+        res_evaluated = to_dense(res)
+        self.assertAllClose(res_evaluated, actual)

     def _test_rmatmul(self, lhs):
         linear_op = self.create_linear_op().detach().requires_grad_(True)

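The identity-operator test overrides `_test_matmul`, so it receives the same treatment: the right-hand side may now be a DiagLinearOperator, and `to_dense` bridges the two cases, materializing operators while (as these tests rely on) leaving plain tensors unchanged. A brief sketch of that behaviour:

```python
import torch
from linear_operator.operators import DiagLinearOperator, to_dense

t = torch.randn(3, 3)
diag = torch.rand(3)

assert torch.equal(to_dense(t), t)                                   # dense input passes through
assert torch.allclose(to_dense(DiagLinearOperator(diag)), torch.diag(diag))  # operator is materialized
```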