|
1 | 1 | import logging |
2 | | -import typing |
3 | 2 | import warnings |
| 3 | +from collections.abc import Sequence |
4 | 4 | from functools import reduce |
5 | 5 | from typing import Literal, cast |
6 | 6 |
|
|
9 | 9 |
|
10 | 10 | import pytensor |
11 | 11 | import pytensor.tensor as pt |
| 12 | +from pytensor.gradient import DisconnectedType |
12 | 13 | from pytensor.graph.basic import Apply |
13 | 14 | from pytensor.graph.op import Op |
14 | 15 | from pytensor.tensor import TensorLike, as_tensor_variable |
@@ -295,31 +296,16 @@ def L_op(self, inputs, outputs, output_gradients): |
295 | 296 | # We need to return (dC/d[inv(A)], dC/db) |
296 | 297 | c_bar = output_gradients[0] |
297 | 298 |
|
298 | | - solve_args = {k: getattr(self, k) for k in self.__props__} |
299 | | - |
300 | | - # Some solvers can solve A.T x = b directly, without ever computing the transpose |
301 | | - has_trans = "transposed" in self.__props__ |
302 | | - |
303 | | - if has_trans: |
304 | | - # If the solver can do transposed solves, we do the opposite of the forward in the reverse. If we solved |
305 | | - # C = solve(A, b), then b_bar = solve(A.T, c_bar). If we solved C = solve(A.T, b), then |
306 | | - # b_bar = solve(A, c_bar) |
307 | | - solve_args["transposed"] = not solve_args["transposed"] |
308 | | - solve_op = type(self)(**solve_args) |
309 | | - b_bar = solve_op(A, c_bar) |
310 | | - |
311 | | - else: |
312 | | - # Otherwise, we have to actually do the transpose of whatever was given |
313 | | - solve_op = type(self)(**solve_args) |
314 | | - b_bar = solve_op(A.T, c_bar) |
| 299 | + trans_solve_op = type(self)( |
| 300 | + **{ |
| 301 | + k: (not getattr(self, k) if k == "lower" else getattr(self, k)) |
| 302 | + for k in self.__props__ |
| 303 | + } |
| 304 | + ) |
| 305 | + b_bar = trans_solve_op(A.T, c_bar) |
315 | 306 |
|
316 | 307 | # force outer product if vector second input |
317 | | - A_bar = -ptm.outer(b_bar, c) if c.ndim == 1 else -b_bar @ c.T |
318 | | - |
319 | | - if has_trans and not solve_args["transposed"]: |
320 | | - # If we did a transposed solve in the forward pass, the program is expecting the |
321 | | - # gradients of A.T, not A |
322 | | - A_bar = A_bar.T |
| 308 | + A_bar = -ptm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T) |
323 | 309 |
|
324 | 310 | return [A_bar, b_bar] |
325 | 311 |
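
As a sanity check on the simplified rule above (`b_bar = trans_solve_op(A.T, c_bar)`, `A_bar = -outer(b_bar, c)`), the vector-Jacobian product can be compared against finite differences with plain scipy. A minimal sketch; the matrix size, seed, and probed entry are arbitrary choices, not part of the diff:

```python
import numpy as np
from scipy.linalg import solve

rng = np.random.default_rng(0)
A = rng.normal(size=(4, 4)) + 4 * np.eye(4)  # well-conditioned test matrix
b = rng.normal(size=4)
c_bar = rng.normal(size=4)  # incoming gradient on c = solve(A, b)

# VJP from the rule above
c = solve(A, b)
b_bar = solve(A.T, c_bar)
A_bar = -np.outer(b_bar, c)

# Finite-difference derivative of phi(A) = c_bar @ solve(A, b) w.r.t. A[1, 2]
eps = 1e-6
E = np.zeros_like(A)
E[1, 2] = eps
fd = (c_bar @ solve(A + E, b) - c_bar @ solve(A - E, b)) / (2 * eps)
np.testing.assert_allclose(fd, A_bar[1, 2], rtol=1e-5)
```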
|
@@ -396,6 +382,186 @@ def cho_solve(c_and_lower, b, *, check_finite=True, b_ndim: int | None = None): |
396 | 382 | )(A, b) |
397 | 383 |
|
398 | 384 |
|
| 385 | +class LU(Op): |
|  386 | +    """Decompose a matrix into permutation, unit lower triangular, and upper triangular factors (pivoted LU decomposition).""" |
| 387 | + |
| 388 | + __props__ = ("permute_l", "overwrite_a", "check_finite", "p_indices") |
| 389 | + |
| 390 | + def __init__( |
| 391 | + self, *, permute_l=False, overwrite_a=False, check_finite=True, p_indices=False |
| 392 | + ): |
| 393 | + self.permute_l = permute_l |
| 394 | + self.check_finite = check_finite |
| 395 | + self.p_indices = p_indices |
| 396 | + self.overwrite_a = overwrite_a |
| 397 | + |
| 398 | + if self.permute_l: |
|  399 | +            # permute_l overrides p_indices in the scipy function, so we mirror that behavior here |
| 400 | + self.gufunc_signature = "(m,m)->(m,m),(m,m)" |
| 401 | + elif self.p_indices: |
| 402 | + self.gufunc_signature = "(m,m)->(m),(m,m),(m,m)" |
| 403 | + else: |
| 404 | + self.gufunc_signature = "(m,m)->(m,m),(m,m),(m,m)" |
| 405 | + |
| 406 | + if self.overwrite_a: |
| 407 | + self.destroy_map = {0: [0]} |
| 408 | + |
| 409 | + def infer_shape(self, fgraph, node, shapes): |
| 410 | + n = shapes[0][0] |
| 411 | + if self.permute_l: |
| 412 | + return [(n, n), (n, n)] |
| 413 | + elif self.p_indices: |
| 414 | + return [(n,), (n, n), (n, n)] |
| 415 | + else: |
| 416 | + return [(n, n), (n, n), (n, n)] |
| 417 | + |
| 418 | + def make_node(self, x): |
| 419 | + x = as_tensor_variable(x) |
| 420 | + if x.type.ndim != 2: |
| 421 | + raise TypeError( |
| 422 | + f"LU only allowed on matrix (2-D) inputs, got {x.type.ndim}-D input" |
| 423 | + ) |
| 424 | + |
| 425 | + real_dtype = "f" if np.dtype(x.type.dtype).char in "fF" else "d" |
| 426 | + p_dtype = "int32" if self.p_indices else np.dtype(real_dtype) |
| 427 | + |
| 428 | + L = tensor(shape=x.type.shape, dtype=real_dtype) |
| 429 | + U = tensor(shape=x.type.shape, dtype=real_dtype) |
| 430 | + |
| 431 | + if self.permute_l: |
| 432 | + # In this case, L is actually P @ L |
| 433 | + return Apply(self, inputs=[x], outputs=[L, U]) |
| 434 | + elif self.p_indices: |
| 435 | + p = tensor(shape=(x.type.shape[0],), dtype=p_dtype) |
| 436 | + return Apply(self, inputs=[x], outputs=[p, L, U]) |
| 437 | + else: |
| 438 | + P = tensor(shape=x.type.shape, dtype=p_dtype) |
| 439 | + return Apply(self, inputs=[x], outputs=[P, L, U]) |
| 440 | + |
| 441 | + def perform(self, node, inputs, outputs): |
| 442 | + [A] = inputs |
| 443 | + |
| 444 | + out = scipy.linalg.lu( |
| 445 | + A, |
| 446 | + permute_l=self.permute_l, |
| 447 | + overwrite_a=self.overwrite_a, |
| 448 | + check_finite=self.check_finite, |
| 449 | + p_indices=self.p_indices, |
| 450 | + ) |
| 451 | + |
| 452 | + outputs[0][0] = out[0] |
| 453 | + outputs[1][0] = out[1] |
| 454 | + |
| 455 | + if not self.permute_l: |
| 456 | + # In all cases except permute_l, there are three returns |
| 457 | + outputs[2][0] = out[2] |
| 458 | + |
| 459 | + def inplace_on_inputs(self, allowed_inplace_inputs: list[int]) -> "Op": |
| 460 | + if 0 in allowed_inplace_inputs: |
| 461 | + new_props = self._props_dict() # type: ignore |
| 462 | + new_props["overwrite_a"] = True |
| 463 | + return type(self)(**new_props) |
| 464 | + else: |
| 465 | + return self |
| 466 | + |
| 467 | + def L_op( |
| 468 | + self, |
| 469 | + inputs: Sequence[ptb.Variable], |
| 470 | + outputs: Sequence[ptb.Variable], |
| 471 | + output_grads: Sequence[ptb.Variable], |
| 472 | + ) -> list[ptb.Variable]: |
| 473 | + r""" |
|  474 | +        Derivation follows "Differentiation of Matrix Functionals Using Triangular Factorization", |
|  475 | +        F. R. de Hoog, R. S. Anderssen, and M. A. Lukas. |
| 476 | + """ |
| 477 | + [A] = inputs |
| 478 | + A = cast(TensorVariable, A) |
| 479 | + |
| 480 | + if self.permute_l: |
| 481 | + PL_bar, U_bar = output_grads |
| 482 | + |
| 483 | + # TODO: Rewrite into permute_l = False for graphs where we need to compute the gradient |
| 484 | + P, L, U = lu( # type: ignore |
| 485 | + A, permute_l=False, check_finite=self.check_finite, p_indices=False |
| 486 | + ) |
| 487 | + |
| 488 | + # Permutation matrix is orthogonal |
| 489 | + L_bar = ( |
| 490 | + P.T @ PL_bar |
| 491 | + if not isinstance(PL_bar.type, DisconnectedType) |
| 492 | + else pt.zeros_like(A) |
| 493 | + ) |
| 494 | + |
| 495 | + elif self.p_indices: |
| 496 | + p, L, U = outputs |
| 497 | + |
| 498 | + # TODO: rewrite to p_indices = False for graphs where we need to compute the gradient |
| 499 | + P = pt.eye(A.shape[0])[p] |
| 500 | + _, L_bar, U_bar = output_grads |
| 501 | + else: |
| 502 | + P, L, U = outputs |
| 503 | + _, L_bar, U_bar = output_grads |
| 504 | + |
| 505 | + L_bar = ( |
| 506 | + L_bar if not isinstance(L_bar.type, DisconnectedType) else pt.zeros_like(A) |
| 507 | + ) |
| 508 | + U_bar = ( |
| 509 | + U_bar if not isinstance(U_bar.type, DisconnectedType) else pt.zeros_like(A) |
| 510 | + ) |
| 511 | + |
| 512 | + x1 = ptb.tril(L.T @ L_bar, k=-1) |
| 513 | + x2 = ptb.triu(U_bar @ U.T) |
| 514 | + |
| 515 | + L_inv_x = solve_triangular(L.T, x1 + x2, lower=False, unit_diagonal=True) |
| 516 | + A_bar = P @ solve_triangular(U, L_inv_x.T, lower=False).T |
| 517 | + |
| 518 | + return [A_bar] |
| 519 | + |
| 520 | + |
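
For reference, the adjoint from de Hoog, Anderssen & Lukas implemented in `LU.L_op` above can be replayed in plain numpy and checked against a finite-difference derivative of a scalar functional of the factors. A minimal sketch; the seed, size, and probed entry are arbitrary, and `L_bar` is kept strictly lower triangular because the unit diagonal of `L` carries no gradient:

```python
import numpy as np
from scipy.linalg import lu as scipy_lu, solve_triangular

rng = np.random.default_rng(1)
A = rng.normal(size=(4, 4)) + 4 * np.eye(4)  # diagonally dominant: stable pivoting
P, L, U = scipy_lu(A)  # A = P @ L @ U

L_bar = np.tril(rng.normal(size=(4, 4)), k=-1)  # cotangent for L (strictly lower)
U_bar = np.triu(rng.normal(size=(4, 4)))        # cotangent for U

# The rule implemented in LU.L_op
x1 = np.tril(L.T @ L_bar, k=-1)
x2 = np.triu(U_bar @ U.T)
L_inv_x = solve_triangular(L.T, x1 + x2, lower=False, unit_diagonal=True)
A_bar = P @ solve_triangular(U, L_inv_x.T, lower=False).T

# Finite-difference check of phi(A) = <L_bar, L> + <U_bar, U> w.r.t. A[2, 0]
def phi(M):
    _, Lm, Um = scipy_lu(M)
    return np.sum(L_bar * Lm) + np.sum(U_bar * Um)

eps = 1e-6
E = np.zeros_like(A)
E[2, 0] = eps
fd = (phi(A + E) - phi(A - E)) / (2 * eps)
np.testing.assert_allclose(fd, A_bar[2, 0], rtol=1e-4)
```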
| 521 | +def lu( |
| 522 | + a: TensorLike, permute_l=False, check_finite=True, p_indices=False |
| 523 | +) -> ( |
| 524 | + tuple[TensorVariable, TensorVariable, TensorVariable] |
| 525 | + | tuple[TensorVariable, TensorVariable] |
| 526 | +): |
| 527 | + """ |
|  528 | +    Factorize a matrix as the product of a permutation matrix, a unit lower triangular matrix, and an upper triangular matrix: |
| 529 | +
|
|  530 | +    .. math:: |
| 531 | +
|
| 532 | + A = P L U |
| 533 | +
|
|  534 | +    where P is a permutation matrix, L is lower triangular with unit diagonal, and U is upper triangular. |
| 535 | +
|
| 536 | + Parameters |
| 537 | + ---------- |
| 538 | + a: TensorLike |
| 539 | + Matrix to be factorized |
| 540 | + permute_l: bool |
| 541 | + If True, L is a product of permutation and unit lower triangular matrices. Only two values, PL and U, will |
| 542 | + be returned in this case, and PL will not be lower triangular. |
| 543 | + check_finite: bool |
| 544 | + Whether to check that the input matrix contains only finite numbers. |
| 545 | + p_indices: bool |
| 546 | + If True, return integer matrix indices for the permutation matrix. Otherwise, return the permutation matrix |
| 547 | + itself. |
| 548 | +
|
| 549 | + Returns |
| 550 | + ------- |
| 551 | + P: TensorVariable |
|  552 | +        Permutation matrix, or an array of integer indices encoding the permutation matrix if p_indices is True. Not returned if permute_l is True. |
| 553 | + L: TensorVariable |
| 554 | + Lower triangular matrix, or product of permutation and unit lower triangular matrices if permute_l is True. |
| 555 | + U: TensorVariable |
| 556 | + Upper triangular matrix |
| 557 | + """ |
| 558 | + return cast( |
| 559 | + tuple[TensorVariable, TensorVariable, TensorVariable] |
| 560 | + | tuple[TensorVariable, TensorVariable], |
| 561 | + LU(permute_l=permute_l, check_finite=check_finite, p_indices=p_indices)(a), |
| 562 | + ) |
| 563 | + |
| 564 | + |
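
A usage sketch for the new `lu` helper, assuming this diff lives in `pytensor.tensor.slinalg` alongside the neighboring solve Ops (any re-export location is not shown in the diff):

```python
import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor.tensor.slinalg import lu

A = pt.dmatrix("A")

P, L, U = lu(A)                    # default: A = P @ L @ U
PL, U2 = lu(A, permute_l=True)     # permutation folded into L: A = PL @ U2
p, L3, U3 = lu(A, p_indices=True)  # integer indices instead of a dense P

f = pytensor.function([A], [P, L, U])
A_val = np.random.default_rng(2).normal(size=(3, 3))
P_val, L_val, U_val = f(A_val)
np.testing.assert_allclose(P_val @ L_val @ U_val, A_val, atol=1e-12)
```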
399 | 565 | class SolveTriangular(SolveBase): |
400 | 566 | """Solve a system of linear equations.""" |
401 | 567 |
|
@@ -513,13 +679,13 @@ class Solve(SolveBase): |
513 | 679 | def __init__(self, *, assume_a="gen", transposed=False, **kwargs): |
514 | 680 | if assume_a not in ("gen", "sym", "her", "pos"): |
515 | 681 | raise ValueError(f"{assume_a} is not a recognized matrix structure") |
| 682 | + |
516 | 683 | super().__init__(**kwargs) |
517 | 684 | self.assume_a = assume_a |
518 | 685 | self.transposed = transposed |
519 | 686 |
|
520 | 687 | def perform(self, node, inputs, outputs): |
521 | 688 | a, b = inputs |
522 | | - |
523 | 689 | outputs[0][0] = scipy.linalg.solve( |
524 | 690 | a=a, |
525 | 691 | b=b, |
@@ -1083,7 +1249,7 @@ def solve_discrete_are( |
1083 | 1249 | ) |
1084 | 1250 |
|
1085 | 1251 |
|
1086 | | -def _largest_common_dtype(tensors: typing.Sequence[TensorVariable]) -> np.dtype: |
| 1252 | +def _largest_common_dtype(tensors: Sequence[TensorVariable]) -> np.dtype: |
1087 | 1253 | return reduce(lambda l, r: np.promote_types(l, r), [x.dtype for x in tensors]) |
1088 | 1254 |
|
1089 | 1255 |
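
`_largest_common_dtype` left-folds `np.promote_types` over the operand dtypes, yielding the smallest dtype that can represent every input. For example:

```python
import numpy as np
from functools import reduce

# float32 with int64 promotes to float64; float64 with complex64 promotes to complex128
reduce(np.promote_types, ["float32", "int64", "complex64"])  # dtype('complex128')
```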
|
|