Add lu_solve function

jessegrabowski · jessegrabowski · commit 91815b3b0a82 · 2025-03-18T23:45:32.000+08:00
diff --git a/pytensor/tensor/slinalg.py b/pytensor/tensor/slinalg.py
@@ -226,6 +226,7 @@ def __init__(
     ):
         self.lower = lower
         self.check_finite = check_finite
+
         assert b_ndim in (1, 2)
         self.b_ndim = b_ndim
         if b_ndim == 1:
@@ -303,10 +304,14 @@ def L_op(self, inputs, outputs, output_gradients):
 
         solve_op = type(self)(**props_dict)
 
-        b_bar = solve_op(A.T, c_bar)
+        b_bar = solve_op(A.mT, c_bar)
         # force outer product if vector second input
         A_bar = -ptm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
 
+        if props_dict.get("unit_diagonal", False):
+            n = A_bar.shape[-1]
+            A_bar = A_bar[pt.arange(n), pt.arange(n)].set(pt.zeros(n))
+
         return [A_bar, b_bar]
 
 
@@ -577,12 +582,42 @@ def lu(
     )
 
 
+def _pivot_to_permutation(pivots):
+    """
+    Converts a sequence of row exchanges (pivot indices) to a vector of permutation indices that represents the same
+    row exchanges. This is the inverse permutation, which can be used to reconstruct the original matrix from its LU
+    factorization. To get the actual permutation, the inverse permutation must be argsorted.
+    """
+
+    def step(i, permutation, swaps):
+        j = swaps[i]
+        x = permutation[i]
+        y = permutation[j]
+        # Swap entries i and j of the running permutation vector.
+        permutation = permutation[i].set(y)
+        return permutation[j].set(x)
+
+    pivots = as_tensor_variable(pivots)
+    n = pivots.shape[0]
+    p_inv, _ = pytensor.scan(
+        step,
+        sequences=[pt.arange(n.copy())],
+        outputs_info=[pt.arange(n.copy())],
+        non_sequences=[pivots],
+    )
+    # Keep only the result of the last scan step, i.e. the vector after every swap has been applied.
+    return p_inv[-1]
+
+
 class LUFactor(Op):
-    __props__ = ("overwrite_a", "check_finite")
+    __props__ = ("overwrite_a", "check_finite", "permutation_indices")
 
-    def __init__(self, *, overwrite_a=False, check_finite=True):
+    def __init__(
+        self, *, overwrite_a=False, check_finite=True, permutation_indices=False
+    ):
         self.overwrite_a = overwrite_a
         self.check_finite = check_finite
+        self.permutation_indices = permutation_indices
         self.gufunc_signature = "(m,m)->(m,m),(m)"
 
         if self.overwrite_a:
@@ -596,8 +631,9 @@ def make_node(self, A):
             )
 
         LU = matrix(shape=A.type.shape, dtype=A.type.dtype)
-        pivots = vector(shape=(A.type.shape[0],), dtype="int32")
-        return Apply(self, [A], [LU, pivots])
+        pivots_or_permutations = vector(shape=(A.type.shape[0],), dtype="int32")
+
+        return Apply(self, [A], [LU, pivots_or_permutations])
 
     def infer_shape(self, fgraph, node, shapes):
         n = shapes[0][0]
@@ -613,25 +649,40 @@ def inplace_on_inputs(self, allowed_inplace_inputs: list[int]) -> "Op":
 
     def perform(self, node, inputs, outputs):
         A = inputs[0]
-        LU, pivots = scipy_linalg.lu_factor(
-            A,
-            overwrite_a=self.overwrite_a,
-            check_finite=self.check_finite,
-        )
+
+        if self.permutation_indices:
+            p, L, U = cast(
+                tuple[np.ndarray, np.ndarray, np.ndarray],
+                scipy_linalg.lu(
+                    A,
+                    overwrite_a=self.overwrite_a,
+                    check_finite=self.check_finite,
+                    p_indices=True,
+                    permute_l=False,
+                ),
+            )
+            LU = np.tril(L, k=-1) + U
+
+        else:
+            LU, p = scipy_linalg.lu_factor(
+                A, overwrite_a=self.overwrite_a, check_finite=self.check_finite
+            )
 
         outputs[0][0] = LU
-        outputs[1][0] = pivots
+        outputs[1][0] = p
 
     def L_op(self, inputs, outputs, output_gradients):
-        A = inputs[0]
+        [A] = inputs
         LU_bar, _ = output_gradients
+        LU, p_indices = outputs
 
-        # We need the permutation matrix P, not the pivot indices. Easiest way is to just do another LU forward.
-        # Alternative is to do a scan over the pivot indices to convert them to permutation indices. I don't know if
-        # that's faster or slower.
-        P, L, U = lu(
-            A, permute_l=False, check_finite=self.check_finite, p_indices=False
-        )
+        eye = ptb.identity_like(A)
+        L = cast(TensorVariable, ptb.tril(LU, k=-1) + eye)
+        U = cast(TensorVariable, ptb.triu(LU))
+
+        if not self.permutation_indices:
+            p_indices_inv = _pivot_to_permutation(cast(TensorVariable, p_indices))
+            p_indices = pt.argsort(p_indices_inv)
 
         # Split LU_bar into L_bar and U_bar. This is valid because of the triangular structure of L and U
         L_bar = ptb.tril(LU_bar, k=-1)
@@ -642,13 +693,14 @@ def L_op(self, inputs, outputs, output_gradients):
         x2 = ptb.triu(U_bar @ U.T)
 
         LT_inv_x = solve_triangular(L.T, x1 + x2, lower=False, unit_diagonal=True)
-        A_bar = P @ solve_triangular(U, LT_inv_x.T, lower=False).T
+        B_bar = solve_triangular(U, LT_inv_x.T, lower=False).T
+        A_bar = B_bar[p_indices]
 
         return [A_bar]
 
 
 def lu_factor(
-    a: TensorLike, *, check_finite=True
+    a: TensorLike, *, check_finite: bool = True, permutation_indices: bool = False
 ) -> tuple[TensorVariable, TensorVariable]:
     """
     LU factorization with partial pivoting.
@@ -659,21 +711,63 @@ def lu_factor(
         Matrix to be factorized
     check_finite: bool
         Whether to check that the input matrix contains only finite numbers.
+    permutation_indices: bool
+        If True, returns permutation indices such that L[p] @ U = A. Otherwise returns the pivot indices, which give
+        a record of row swaps that occurred at each iteration of the LU factorization. Default is False, which matches
+        the behavior of scipy.linalg.lu_factor.
 
     Returns
     -------
     LU: TensorVariable
         LU decomposition of `a`
-    pivots: TensorVariable
-        Permutation indices
+    pivots_or_permutations: TensorVariable
+        An array of integers representing either the pivot indices or permutation indices, depending on the value of
+        `permutation_indices`.
     """
 
     return cast(
         tuple[TensorVariable, TensorVariable],
-        Blockwise(LUFactor(check_finite=check_finite))(a),
+        Blockwise(
+            LUFactor(check_finite=check_finite, permutation_indices=permutation_indices)
+        )(a),
     )
 
 
+def lu_solve(
+    LU_and_pivots: tuple[TensorVariable, TensorVariable],
+    b: TensorVariable,
+    trans=False,
+    b_ndim=None,
+    check_finite=True,
+):
+    LU, pivots = LU_and_pivots
+    inv_permutation = _pivot_to_permutation(pivots)
+    # NOTE(review): pivots are always converted as pivot indices; permutation indices would need different handling.
+    x = b[inv_permutation] if not trans else b
+    # First solve: unit-diagonal lower triangle of LU when trans is falsy, the upper triangle otherwise.
+    x = solve_triangular(
+        LU,
+        x,
+        lower=not trans,
+        unit_diagonal=not trans,
+        trans=trans,
+        b_ndim=b_ndim,
+        check_finite=check_finite,
+    )
+    # Second solve: the opposite triangle of the same LU array (lower/unit_diagonal flags flipped).
+    x = solve_triangular(
+        LU,
+        x,
+        lower=trans,
+        unit_diagonal=trans,
+        trans=trans,
+        b_ndim=b_ndim,
+        check_finite=check_finite,
+    )
+    # With trans the permutation is applied to the result here rather than to b before the solves.
+    return x[pt.argsort(inv_permutation)] if trans else x
+
+
 class SolveTriangular(SolveBase):
     """Solve a system of linear equations."""
 
@@ -688,6 +782,9 @@ class SolveTriangular(SolveBase):
     def __init__(self, *, unit_diagonal=False, **kwargs):
         if kwargs.get("overwrite_a", False):
             raise ValueError("overwrite_a is not supported for SolverTriangulare")
+
+        # There's a naming inconsistency between solve_triangular (trans) and solve (transposed). Internally, we can use
+        # transposed everywhere, but expose the same API as scipy.linalg.solve_triangular
         super().__init__(**kwargs)
         self.unit_diagonal = unit_diagonal
 
@@ -1546,4 +1643,5 @@ def block_diag(*matrices: TensorVariable):
     "cho_solve",
     "lu",
     "lu_factor",
+    "lu_solve",
 ]
diff --git a/tests/link/numba/test_slinalg.py b/tests/link/numba/test_slinalg.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 from numpy.testing import assert_allclose
+from scipy import linalg as scipy_linalg
 
 import pytensor
 import pytensor.tensor as pt
@@ -266,15 +267,13 @@ def test_block_diag():
 
 
 def test_lamch():
-    from scipy.linalg import get_lapack_funcs
-
     from pytensor.link.numba.dispatch.slinalg import _xlamch
 
     @numba.njit()
     def xlamch(kind):
         return _xlamch(kind)
 
-    lamch = get_lapack_funcs("lamch", (np.array([0.0], dtype=floatX),))
+    lamch = scipy_linalg.get_lapack_funcs("lamch", (np.array([0.0], dtype=floatX),))
 
     np.testing.assert_allclose(xlamch("E"), lamch("E"))
     np.testing.assert_allclose(xlamch("S"), lamch("S"))
@@ -289,23 +288,19 @@ def xlamch(kind):
 )
 def test_xlange(ord_numba, ord_scipy):
     # xlange is called internally only, we don't dispatch pt.linalg.norm to it
-    from scipy import linalg
-
     from pytensor.link.numba.dispatch.slinalg import _xlange
 
     @numba.njit()
     def xlange(x, ord):
         return _xlange(x, ord)
 
     x = np.random.normal(size=(5, 5)).astype(floatX)
-    np.testing.assert_allclose(xlange(x, ord_numba), linalg.norm(x, ord_scipy))
+    np.testing.assert_allclose(xlange(x, ord_numba), scipy_linalg.norm(x, ord_scipy))
 
 
 @pytest.mark.parametrize("ord_numba, ord_scipy", [("1", 1), ("I", np.inf)])
 def test_xgecon(ord_numba, ord_scipy):
     # gecon is called internally only, we don't dispatch pt.linalg.norm to it
-    from scipy.linalg import get_lapack_funcs
-
     from pytensor.link.numba.dispatch.slinalg import _xgecon, _xlange
 
     @numba.njit()
@@ -320,7 +315,7 @@ def gecon(x, norm):
 
     # Test against direct call to the underlying LAPACK functions
     # Solution does **not** agree with 1 / np.linalg.cond(x) !
-    lange, gecon = get_lapack_funcs(("lange", "gecon"), (x,))
+    lange, gecon = scipy_linalg.get_lapack_funcs(("lange", "gecon"), (x,))
     norm = lange(ord_numba, x)
     rcond2, _ = gecon(x, norm, norm=ord_numba)
 
@@ -330,8 +325,6 @@ def gecon(x, norm):
 
 @pytest.mark.parametrize("overwrite_a", [True, False])
 def test_getrf(overwrite_a):
-    from scipy.linalg import lu_factor
-
     from pytensor.link.numba.dispatch.slinalg import _getrf
 
     # TODO: Refactor this test to use compare_numba_and_py after we implement lu_factor in pytensor
@@ -345,7 +338,7 @@ def getrf(x, overwrite_a):
         x
     )  # x needs to be fortran-contiguous going into getrf for the overwrite option to work
 
-    lu, ipiv = lu_factor(x, overwrite_a=False)
+    lu, ipiv = scipy_linalg.lu_factor(x, overwrite_a=False)
     LU, IPIV, info = getrf(x, overwrite_a=overwrite_a)
 
     assert info == 0
@@ -364,9 +357,6 @@ def getrf(x, overwrite_a):
 @pytest.mark.parametrize("overwrite_b", [True, False])
 @pytest.mark.parametrize("b_shape", [(5,), (5, 3)], ids=["b_1d", "b_2d"])
 def test_getrs(trans, overwrite_a, overwrite_b, b_shape):
-    from scipy.linalg import lu_factor
-    from scipy.linalg import lu_solve as sp_lu_solve
-
     from pytensor.link.numba.dispatch.slinalg import _getrf, _getrs
 
     # TODO: Refactor this test to use compare_numba_and_py after we implement lu_solve in pytensor
@@ -384,8 +374,8 @@ def lu_solve(a, b, trans, overwrite_a, overwrite_b):
     a = np.asfortranarray(a)
     b = np.asfortranarray(b)
 
-    lu_and_piv = lu_factor(a, overwrite_a=False)
-    x_sp = sp_lu_solve(lu_and_piv, b, trans, overwrite_b=False)
+    lu_and_piv = scipy_linalg.lu_factor(a, overwrite_a=False)
+    x_sp = scipy_linalg.lu_solve(lu_and_piv, b, trans, overwrite_b=False)
 
     x, lu, info = lu_solve(
         a, b, trans, overwrite_a=overwrite_a, overwrite_b=overwrite_b
diff --git a/tests/tensor/test_slinalg.py b/tests/tensor/test_slinalg.py