Add numba dispatch for LU

jessegrabowski · jessegrabowski · commit 70ef520ae043 · 2025-03-18T23:45:32.000+08:00
diff --git a/pytensor/link/numba/dispatch/slinalg.py b/pytensor/link/numba/dispatch/slinalg.py
@@ -1,8 +1,10 @@
 import warnings
 from collections.abc import Callable
+from typing import cast as typing_cast
 
 import numba
 import numpy as np
+import scipy.linalg
 from numba.core import types
 from numba.extending import overload
 from numba.np.linalg import _copy_to_fortran_order, ensure_lapack
@@ -18,6 +20,7 @@
 )
 from pytensor.link.numba.dispatch.basic import numba_funcify
 from pytensor.tensor.slinalg import (
+    LU,
     BlockDiagonal,
     Cholesky,
     CholeskySolve,
@@ -476,10 +479,11 @@ def impl(A: np.ndarray, A_norm: float, norm: str) -> tuple[np.ndarray, int]:
 def _getrf(A, overwrite_a=False) -> tuple[np.ndarray, np.ndarray, int]:
     """
     Placeholder for LU factorization; used by linalg.solve.
-
-    # TODO: Implement an LU_factor Op, then dispatch to this function in numba mode.
     """
-    return  # type: ignore
+    getrf = scipy.linalg.get_lapack_funcs("getrf", (A,))
+    A_copy, ipiv, info = getrf(A, overwrite_a=overwrite_a)
+
+    return A_copy, ipiv, info  # include info: callers unpack three values, as annotated
 
 
 @overload(_getrf)
@@ -515,6 +519,263 @@ def impl(
     return impl
 
 
+def _lu_1(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Thin wrapper around scipy.linalg.lu. Used as an overload target so that importing PyTensor has no side effects for users.
+
+    Called when p_indices is True, and returns a tuple of (perm, L, U), where perm is an integer
+    array of permutation indices, such that L[perm] @ U = A.
+    """
+    return typing_cast(
+        tuple[np.ndarray, np.ndarray, np.ndarray],
+        linalg.lu(
+            a,
+            permute_l=permute_l,
+            check_finite=check_finite,
+            p_indices=p_indices,
+            overwrite_a=overwrite_a,
+        ),
+    )
+
+
+def _lu_2(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Thin wrapper around scipy.linalg.lu. Used as an overload target so that importing PyTensor has no side effects for users.
+
+    Called when permute_l is True and p_indices is False, and returns a tuple of (PL, U), where PL is the
+    permuted L matrix, PL = P @ L.
+    """
+    return typing_cast(
+        tuple[np.ndarray, np.ndarray],
+        linalg.lu(
+            a,
+            permute_l=permute_l,
+            check_finite=check_finite,
+            p_indices=p_indices,
+            overwrite_a=overwrite_a,
+        ),
+    )
+
+
+def _lu_3(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Thin wrapper around scipy.linalg.lu. Used as an overload target so that importing PyTensor has no side effects for users.
+
+    Called when permute_l is False and p_indices is False, and returns a tuple of (P, L, U), where P is the permutation
+    matrix, P @ L @ U = A.
+    """
+    return typing_cast(
+        tuple[np.ndarray, np.ndarray, np.ndarray],
+        linalg.lu(
+            a,
+            permute_l=permute_l,
+            check_finite=check_finite,
+            p_indices=p_indices,
+            overwrite_a=overwrite_a,
+        ),
+    )
+
+
+@overload(_lu_1)
+def lu_impl_1(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> Callable[
+    [np.ndarray, bool, bool, bool, bool], tuple[np.ndarray, np.ndarray, np.ndarray]
+]:
+    """
+    Overload scipy.linalg.lu with a numba function. This function is called when p_indices is True. Returns a tuple of
+    (perm, L, U), where perm is an integer array of permutation indices, such that L[perm] @ U = A.
+    """
+    ensure_lapack()
+    _check_scipy_linalg_matrix(a, "lu")
+    dtype = a.dtype
+
+    def impl(
+        a: np.ndarray,
+        permute_l: bool,
+        check_finite: bool,
+        p_indices: bool,
+        overwrite_a: bool,
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        A_copy, IPIV, INFO = _getrf(a, overwrite_a=overwrite_a)
+
+        L = np.eye(A_copy.shape[-1], dtype=dtype)
+        L += np.tril(A_copy, k=-1)
+        U = np.triu(A_copy)
+
+        # Fortran is 1 indexed, so we need to subtract 1 from the IPIV array
+        IPIV = IPIV - 1
+        p_inv = np.arange(len(IPIV))
+        for i in range(len(IPIV)):
+            p_inv[i], p_inv[IPIV[i]] = p_inv[IPIV[i]], p_inv[i]
+
+        perm = np.argsort(p_inv)
+        return perm, L, U
+
+    return impl
+
+
+@overload(_lu_2)
+def lu_impl_2(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> Callable[[np.ndarray, bool, bool, bool, bool], tuple[np.ndarray, np.ndarray]]:
+    """
+    Overload scipy.linalg.lu with a numba function. This function is called when permute_l is True and p_indices is
+    False. Returns a tuple of (PL, U), where PL is the permuted L matrix, PL = P @ L.
+    """
+
+    ensure_lapack()
+    _check_scipy_linalg_matrix(a, "lu")
+    dtype = a.dtype
+
+    def impl(
+        a: np.ndarray,
+        permute_l: bool,
+        check_finite: bool,
+        p_indices: bool,
+        overwrite_a: bool,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        A_copy, IPIV, INFO = _getrf(a, overwrite_a=overwrite_a)
+
+        L = np.eye(A_copy.shape[-1], dtype=dtype)
+        L += np.tril(A_copy, k=-1)
+        U = np.triu(A_copy)
+
+        # Fortran is 1 indexed, so we need to subtract 1 from the IPIV array
+        IPIV = IPIV - 1
+        p_inv = np.arange(len(IPIV))
+        for i in range(len(IPIV)):
+            p_inv[i], p_inv[IPIV[i]] = p_inv[IPIV[i]], p_inv[i]
+
+        perm = np.argsort(p_inv)
+        PL = L[perm]
+        return PL, U
+
+    return impl
+
+
+@overload(_lu_3)
+def lu_impl_3(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> Callable[
+    [np.ndarray, bool, bool, bool, bool], tuple[np.ndarray, np.ndarray, np.ndarray]
+]:
+    """
+    Overload scipy.linalg.lu with a numba function. This function is called when permute_l is False and p_indices is
+    False. Returns a tuple of (P, L, U), such that P @ L @ U = A.
+    """
+    ensure_lapack()
+    _check_scipy_linalg_matrix(a, "lu")
+    dtype = a.dtype
+
+    def impl(
+        a: np.ndarray,
+        permute_l: bool,
+        check_finite: bool,
+        p_indices: bool,
+        overwrite_a: bool,
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        A_copy, IPIV, INFO = _getrf(a, overwrite_a=overwrite_a)
+
+        L = np.eye(A_copy.shape[-1], dtype=dtype)
+        L += np.tril(A_copy, k=-1)
+        U = np.triu(A_copy)
+
+        # Fortran is 1 indexed, so we need to subtract 1 from the IPIV array
+        IPIV = IPIV - 1
+        p_inv = np.arange(len(IPIV))
+        for i in range(len(IPIV)):
+            p_inv[i], p_inv[IPIV[i]] = p_inv[IPIV[i]], p_inv[i]
+
+        perm = np.argsort(p_inv)
+        P = np.eye(A_copy.shape[-1], dtype=dtype)[perm]
+
+        return P, L, U
+
+    return impl
+
+
+@numba_funcify.register(LU)
+def numba_funcify_LU(op, node, **kwargs):
+    permute_l = op.permute_l
+    check_finite = op.check_finite
+    p_indices = op.p_indices
+    overwrite_a = op.overwrite_a
+
+    dtype = node.inputs[0].dtype
+    if str(dtype).startswith("complex"):
+        raise NotImplementedError(
+            "Complex inputs not currently supported by lu in Numba mode"
+        )
+
+    @numba_basic.numba_njit(inline="always")
+    def lu(a):
+        if check_finite:
+            if np.any(np.bitwise_or(np.isinf(a), np.isnan(a))):
+                raise np.linalg.LinAlgError(
+                    "Non-numeric values (nan or inf) found in input to lu"
+                )
+
+        if p_indices:
+            res = _lu_1(
+                a,
+                permute_l=permute_l,
+                check_finite=check_finite,
+                p_indices=p_indices,
+                overwrite_a=overwrite_a,
+            )
+        elif permute_l:
+            res = _lu_2(
+                a,
+                permute_l=permute_l,
+                check_finite=check_finite,
+                p_indices=p_indices,
+                overwrite_a=overwrite_a,
+            )
+        else:
+            res = _lu_3(
+                a,
+                permute_l=permute_l,
+                check_finite=check_finite,
+                p_indices=p_indices,
+                overwrite_a=overwrite_a,
+            )
+
+        return res
+
+    return lu
+
+
 def _getrs(
     LU: np.ndarray, B: np.ndarray, IPIV: np.ndarray, trans: int, overwrite_b: bool
 ) -> tuple[np.ndarray, int]:
diff --git a/pytensor/tensor/slinalg.py b/pytensor/tensor/slinalg.py
@@ -1447,4 +1447,5 @@ def block_diag(*matrices: TensorVariable):
     "solve_triangular",
     "block_diag",
     "cho_solve",
+    "lu",
 ]
diff --git a/tests/link/numba/test_slinalg.py b/tests/link/numba/test_slinalg.py
@@ -496,3 +496,35 @@ def test_cho_solve(b_func, b_size, lower):
     RTOL = 1e-8 if floatX.endswith("64") else 1e-4
 
     np.testing.assert_allclose(A @ X_np, b, atol=ATOL, rtol=RTOL)
+
+
+@pytest.mark.parametrize(
+    "permute_l, p_indices",
+    [(True, False), (False, True), (False, False)],
+    ids=["PL", "p_indices", "P"],
+)
+@pytest.mark.parametrize("shape", [(3, 5, 5), (5, 5)], ids=["batched", "not_batched"])
+def test_numba_lu(permute_l, p_indices, shape: tuple[int, ...]):
+    rng = np.random.default_rng(418)  # fixed seed so any failure is reproducible
+    A = pt.tensor(
+        "A",
+        shape=shape,
+        dtype=config.floatX,
+    )
+
+    out = pt.linalg.lu(A, permute_l=permute_l, p_indices=p_indices)
+    f = pytensor.function([A], out, mode="NUMBA")
+
+    A_val = rng.normal(size=shape).astype(config.floatX)
+    if len(shape) == 2:
+        compare_numba_and_py([A], out, test_inputs=[A_val], inplace=True)
+
+    else:
+        # compare_numba_and_py fails: NotImplementedError: Non-jitted BlockwiseWithCoreShape not implemented
+        nb_out = f(A_val.copy())
+        sp_out = scipy_linalg.lu(
+            A_val.copy(), permute_l=permute_l, p_indices=p_indices, check_finite=False
+        )
+
+        for a, b in zip(nb_out, sp_out, strict=True):
+            np.testing.assert_allclose(a, b)

Original file line number	Diff line number	Diff line change
`@@ -1447,4 +1447,5 @@ def block_diag(*matrices: TensorVariable):`
`1447`	`1447`	`"solve_triangular",`
`1448`	`1448`	`"block_diag",`
`1449`	`1449`	`"cho_solve",`
	`1450`	`+ "lu",`
`1450`	`1451`	`]`