pymc-devs
diff --git a/‎pytensor/tensor/slinalg.py‎
Lines changed: 171 additions & 23 deletions b/‎pytensor/tensor/slinalg.py‎
Lines changed: 171 additions & 23 deletions
@@ -10,6 +10,7 @@
 
 import pytensor
 import pytensor.tensor as pt
+from pytensor.compile.builders import OpFromGraph
 from pytensor.gradient import DisconnectedType
 from pytensor.graph.basic import Apply
 from pytensor.graph.op import Op
@@ -226,6 +227,7 @@ def __init__(
     ):
         self.lower = lower
         self.check_finite = check_finite
+
         assert b_ndim in (1, 2)
         self.b_ndim = b_ndim
         if b_ndim == 1:
@@ -303,10 +305,14 @@ def L_op(self, inputs, outputs, output_gradients):
 
         solve_op = type(self)(**props_dict)
 
-        b_bar = solve_op(A.T, c_bar)
+        b_bar = solve_op(A.mT, c_bar)
         # force outer product if vector second input
         A_bar = -ptm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
 
+        if props_dict.get("unit_diagonal", False):
+            n = A_bar.shape[-1]
+            A_bar = A_bar[pt.arange(n), pt.arange(n)].set(pt.zeros(n))
+
         return [A_bar, b_bar]
 
 
@@ -577,12 +583,42 @@ def lu(
     )
 
 
+def _pivot_to_permutation(pivots):
+    """
+    Converts a sequence of row exchanges to a permutation matrix that represents the same row exchanges. This
+    represents the inverse permutation, which can be used to reconstruct the original matrix from its LU factorization.
+    To get the actual permutation, the inverse permutation must be argsorted.
+    """
+
+    def step(i, permutation, swaps):
+        j = swaps[i]
+        x = permutation[i]
+        y = permutation[j]
+
+        permutation = permutation[i].set(y)
+        return permutation[j].set(x)
+
+    pivots = as_tensor_variable(pivots)
+    n = pivots.shape[0]
+    p_inv, _ = pytensor.scan(
+        step,
+        sequences=[pt.arange(n.copy())],
+        outputs_info=[pt.arange(n.copy())],
+        non_sequences=[pivots],
+    )
+
+    return p_inv[-1]
+
+
 class LUFactor(Op):
-    __props__ = ("overwrite_a", "check_finite")
+    __props__ = ("overwrite_a", "check_finite", "permutation_indices")
 
-    def __init__(self, *, overwrite_a=False, check_finite=True):
+    def __init__(
+        self, *, overwrite_a=False, check_finite=True, permutation_indices=False
+    ):
         self.overwrite_a = overwrite_a
         self.check_finite = check_finite
+        self.permutation_indices = permutation_indices
         self.gufunc_signature = "(m,m)->(m,m),(m)"
 
         if self.overwrite_a:
@@ -596,8 +632,9 @@ def make_node(self, A):
             )
 
         LU = matrix(shape=A.type.shape, dtype=A.type.dtype)
-        pivots = vector(shape=(A.type.shape[0],), dtype="int32")
-        return Apply(self, [A], [LU, pivots])
+        pivots_or_permutations = vector(shape=(A.type.shape[0],), dtype="int32")
+
+        return Apply(self, [A], [LU, pivots_or_permutations])
 
     def infer_shape(self, fgraph, node, shapes):
         n = shapes[0][0]
@@ -613,25 +650,40 @@ def inplace_on_inputs(self, allowed_inplace_inputs: list[int]) -> "Op":
 
     def perform(self, node, inputs, outputs):
         A = inputs[0]
-        LU, pivots = scipy_linalg.lu_factor(
-            A,
-            overwrite_a=self.overwrite_a,
-            check_finite=self.check_finite,
-        )
+
+        if self.permutation_indices:
+            p, L, U = cast(
+                tuple[np.ndarray, np.ndarray, np.ndarray],
+                scipy_linalg.lu(
+                    A,
+                    overwrite_a=self.overwrite_a,
+                    check_finite=self.check_finite,
+                    p_indices=True,
+                    permute_l=False,
+                ),
+            )
+            LU = np.tril(L, k=-1) + U
+
+        else:
+            LU, p = scipy_linalg.lu_factor(
+                A, overwrite_a=self.overwrite_a, check_finite=self.check_finite
+            )
 
         outputs[0][0] = LU
-        outputs[1][0] = pivots
+        outputs[1][0] = p
 
     def L_op(self, inputs, outputs, output_gradients):
-        A = inputs[0]
+        [A] = inputs
         LU_bar, _ = output_gradients
+        LU, p_indices = outputs
 
-        # We need the permutation matrix P, not the pivot indices. Easiest way is to just do another LU forward.
-        # Alternative is to do a scan over the pivot indices to convert them to permutation indices. I don't know if
-        # that's faster or slower.
-        P, L, U = lu(
-            A, permute_l=False, check_finite=self.check_finite, p_indices=False
-        )
+        eye = ptb.identity_like(A)
+        L = cast(TensorVariable, ptb.tril(LU, k=-1) + eye)
+        U = cast(TensorVariable, ptb.triu(LU))
+
+        if not self.permutation_indices:
+            p_indices_inv = _pivot_to_permutation(cast(TensorVariable, p_indices))
+            p_indices = pt.argsort(p_indices_inv)
 
         # Split LU_bar into L_bar and U_bar. This is valid because of the triangular structure of L and U
         L_bar = ptb.tril(LU_bar, k=-1)
@@ -642,13 +694,14 @@ def L_op(self, inputs, outputs, output_gradients):
         x2 = ptb.triu(U_bar @ U.T)
 
         LT_inv_x = solve_triangular(L.T, x1 + x2, lower=False, unit_diagonal=True)
-        A_bar = P @ solve_triangular(U, LT_inv_x.T, lower=False).T
+        B_bar = solve_triangular(U, LT_inv_x.T, lower=False).T
+        A_bar = B_bar[p_indices]
 
         return [A_bar]
 
 
 def lu_factor(
-    a: TensorLike, *, check_finite=True
+    a: TensorLike, *, check_finite: bool = True, permutation_indices: bool = False
 ) -> tuple[TensorVariable, TensorVariable]:
     """
     LU factorization with partial pivoting.
@@ -659,21 +712,112 @@ def lu_factor(
         Matrix to be factorized
     check_finite: bool
         Whether to check that the input matrix contains only finite numbers.
+    permutation_indices: bool
+        If True, returns permutation indices such that L[p] @ U = A. Otherwise returns the pivot indices, which give
+        a record of row swaps that occured at each iteration of the LU factorization. Default is False, which matches
+        the behavior of scipy.linalg.lu_factor.
 
     Returns
     -------
     LU: TensorVariable
         LU decomposition of `a`
-    pivots: TensorVariable
-        Permutation indices
+    pivots_or_permutations: TensorVariable
+        An array of integers representing either the pivot indices or permutation indices, depending on the value of
+        `permutation_indices`.
     """
 
     return cast(
         tuple[TensorVariable, TensorVariable],
-        Blockwise(LUFactor(check_finite=check_finite))(a),
+        Blockwise(
+            LUFactor(check_finite=check_finite, permutation_indices=permutation_indices)
+        )(a),
     )
 
 
+class LUSolve(OpFromGraph):
+    """Solve a system of linear equations given the LU decomposition of the matrix."""
+
+    __props__ = ("trans", "b_ndim", "check_finite", "overwrite_b")
+
+    def __init__(
+        self,
+        *args,
+        trans: bool = False,
+        b_ndim: int | None = None,
+        check_finite: bool = False,
+        overwrite_b: bool = False,
+        **kwargs,
+    ):
+        self.trans = trans
+        self.b_ndim = b_ndim
+        self.check_finite = check_finite
+        self.overwrite_b = overwrite_b
+
+        super().__init__(*args, **kwargs)
+
+
+def lu_solve(
+    LU_and_pivots: tuple[TensorLike, TensorLike],
+    b: TensorLike,
+    trans: bool = False,
+    b_ndim: int | None = None,
+    check_finite: bool = True,
+):
+    """
+    Solve a system of linear equations given the LU decomposition of the matrix.
+
+    Parameters
+    ----------
+    LU_and_pivots: tuple[TensorLike, TensorLike]
+        LU decomposition of the matrix, as returned by `lu_factor`
+    b: TensorLike
+        Right-hand side of the equation
+    trans: bool
+        If True, solve A^T x = b, instead of Ax = b. Default is False
+    b_ndim: int, optional
+        The number of core dimensions in b. Used to distinguish between a batch of vectors (b_ndim=1) and a matrix
+        of vectors (b_ndim=2). Default is None, which will infer the number of core dimensions from the input.
+    check_finite: bool
+        If True, check that the input matrices contain only finite numbers. Default is True.
+    """
+    b_ndim = _default_b_ndim(b, b_ndim)
+    LU, pivots = LU_and_pivots
+
+    LU, pivots, b = map(pt.as_tensor_variable, [LU, pivots, b])
+    inv_permutation = _pivot_to_permutation(pivots)
+
+    x = b[inv_permutation] if not trans else b
+
+    x = solve_triangular(
+        LU,
+        x,
+        lower=not trans,
+        unit_diagonal=not trans,
+        trans=trans,
+        b_ndim=b_ndim,
+        check_finite=check_finite,
+    )
+
+    x = solve_triangular(
+        LU,
+        x,
+        lower=trans,
+        unit_diagonal=trans,
+        trans=trans,
+        b_ndim=b_ndim,
+        check_finite=check_finite,
+    )
+    x = x[pt.argsort(inv_permutation)] if trans else x
+
+    return LUSolve(
+        inputs=[LU, pivots, b],
+        outputs=[x],
+        trans=trans,
+        b_ndim=b_ndim,
+        check_finite=check_finite,
+    )(LU, pivots, b)
+
+
 class SolveTriangular(SolveBase):
     """Solve a system of linear equations."""
 
@@ -688,6 +832,9 @@ class SolveTriangular(SolveBase):
     def __init__(self, *, unit_diagonal=False, **kwargs):
         if kwargs.get("overwrite_a", False):
             raise ValueError("overwrite_a is not supported for SolverTriangulare")
+
+        # There's a naming inconsistency between solve_triangular (trans) and solve (transposed). Internally, we can use
+        # transpose everywhere, but expose the same API as scipy.linalg.solve_triangular
         super().__init__(**kwargs)
         self.unit_diagonal = unit_diagonal
 
@@ -1546,4 +1693,5 @@ def block_diag(*matrices: TensorVariable):
     "cho_solve",
     "lu",
     "lu_factor",
+    "lu_solve",
 ]