Fix contiguity bugs in Numba lapack routines

ricardoV94 · ricardoV94 · commit 3d2d8b4589a1 · 2025-03-19T15:08:38.000+01:00
diff --git a/pytensor/link/numba/dispatch/slinalg.py b/pytensor/link/numba/dispatch/slinalg.py
@@ -26,6 +26,12 @@
 )
 
 
+@numba_basic.numba_njit(inline="always")
+def _copy_to_fortran_order_even_if_1d(x):
+    # Numba's _copy_to_fortran_order doesn't do anything for vectors, so 1d inputs are copied explicitly
+    return x.copy() if x.ndim == 1 else _copy_to_fortran_order(x)
+
+
 @numba_basic.numba_njit(inline="always")
 def _solve_check(n, info, lamch=False, rcond=None):
     """
@@ -497,10 +503,10 @@ def impl(
     ) -> tuple[np.ndarray, np.ndarray, int]:
         _M, _N = np.int32(A.shape[-2:])  # type: ignore
 
-        if not overwrite_a:
-            A_copy = _copy_to_fortran_order(A)
-        else:
+        if overwrite_a and A.flags.f_contiguous:
             A_copy = A
+        else:
+            A_copy = _copy_to_fortran_order(A)
 
         M = val_to_int_ptr(_M)  # type: ignore
         N = val_to_int_ptr(_N)  # type: ignore
@@ -545,13 +551,14 @@ def impl(
 
         B_is_1d = B.ndim == 1
 
-        if not overwrite_b:
-            B_copy = _copy_to_fortran_order(B)
-        else:
+        if overwrite_b and B.flags.f_contiguous:
             B_copy = B
+        else:
+            B_copy = _copy_to_fortran_order_even_if_1d(B)
 
         if B_is_1d:
             B_copy = np.expand_dims(B_copy, -1)
+            assert B_copy.flags.f_contiguous
 
         NRHS = 1 if B_is_1d else int(B_copy.shape[-1])
 
@@ -681,19 +688,22 @@ def impl(
         _LDA, _N = np.int32(A.shape[-2:])  # type: ignore
         _solve_check_input_shapes(A, B)
 
-        if not overwrite_a:
-            A_copy = _copy_to_fortran_order(A)
-        else:
+        if overwrite_a and (A.flags.f_contiguous or A.flags.c_contiguous):
+            # A symmetric c_contiguous is the same as a symmetric f_contiguous
             A_copy = A
+        else:
+            A_copy = _copy_to_fortran_order(A)
 
         B_is_1d = B.ndim == 1
 
-        if not overwrite_b:
-            B_copy = _copy_to_fortran_order(B)
-        else:
+        if overwrite_b and B.flags.f_contiguous:
             B_copy = B
+        else:
+            B_copy = _copy_to_fortran_order_even_if_1d(B)
+
         if B_is_1d:
-            B_copy = np.asfortranarray(np.expand_dims(B_copy, -1))
+            B_copy = np.expand_dims(B_copy, -1)
+            assert B_copy.flags.f_contiguous
 
         NRHS = 1 if B_is_1d else int(B.shape[-1])
 
@@ -864,7 +874,7 @@ def _posv(
     overwrite_b: bool,
     check_finite: bool,
     transposed: bool,
-) -> tuple[np.ndarray, int]:
+) -> tuple[np.ndarray, np.ndarray, int]:
     """
     Placeholder for solving a linear system with a positive-definite matrix; used by linalg.solve.
     """
@@ -881,7 +891,8 @@ def posv_impl(
     check_finite: bool,
     transposed: bool,
 ) -> Callable[
-    [np.ndarray, np.ndarray, bool, bool, bool, bool, bool], tuple[np.ndarray, int]
+    [np.ndarray, np.ndarray, bool, bool, bool, bool, bool],
+    tuple[np.ndarray, np.ndarray, int],
 ]:
     ensure_lapack()
     _check_scipy_linalg_matrix(A, "solve")
@@ -903,17 +914,23 @@ def impl(
 
         _N = np.int32(A.shape[-1])
 
-        if not overwrite_a:
-            A_copy = _copy_to_fortran_order(A)
+        if overwrite_a and (A.flags.c_contiguous or A.flags.f_contiguous):
+            # Contiguous A is overwritten in place; f_contiguous needs no change
+            A_copy = A
+            if A.flags.c_contiguous:
+                # A lower c_contiguous is the same as an upper f_contiguous
+                # And an upper c_contiguous is the same as a lower f_contiguous
+                # so flip the triangle flag instead of copying A
+                lower = not lower
         else:
-            A_copy = A
+            A_copy = _copy_to_fortran_order(A)
 
         B_is_1d = B.ndim == 1
 
-        if not overwrite_b:
-            B_copy = _copy_to_fortran_order(B)
-        else:
+        if overwrite_b and B.flags.f_contiguous:
             B_copy = B
+        else:
+            B_copy = _copy_to_fortran_order_even_if_1d(B)
 
         if B_is_1d:
             B_copy = np.expand_dims(B_copy, -1)
@@ -939,8 +956,9 @@ def impl(
         )
 
         if B_is_1d:
-            return B_copy[..., 0], int_ptr_to_val(INFO)
-        return B_copy, int_ptr_to_val(INFO)
+            B_copy = B_copy[..., 0]
+
+        return A_copy, B_copy, int_ptr_to_val(INFO)
 
     return impl
 
@@ -1041,10 +1059,12 @@ def impl(
     ) -> np.ndarray:
         _solve_check_input_shapes(A, B)
 
-        x, info = _posv(A, B, lower, overwrite_a, overwrite_b, check_finite, transposed)
+        lu, x, info = _posv(
+            A, B, lower, overwrite_a, overwrite_b, check_finite, transposed
+        )
         _solve_check(A.shape[-1], info)
 
-        rcond, info = _pocon(x, _xlange(A))
+        rcond, info = _pocon(lu, _xlange(A))
         _solve_check(A.shape[-1], info=info, lamch=True, rcond=rcond)
 
         return x
diff --git a/tests/link/numba/test_basic.py b/tests/link/numba/test_basic.py
@@ -261,7 +261,7 @@ def assert_fn(x, y):
                 x, y
             )
 
-    if any(inp.owner is not None for inp in graph_inputs):
+    if any(isinstance(inp, Variable) and inp.owner is not None for inp in graph_inputs):
         raise ValueError("Inputs must be root variables")
 
     pytensor_py_fn = function(
diff --git a/tests/link/numba/test_slinalg.py b/tests/link/numba/test_slinalg.py
@@ -8,8 +8,9 @@
 
 import pytensor
 import pytensor.tensor as pt
-from pytensor import config
-from pytensor.tensor.slinalg import SolveTriangular
+from pytensor import In, config
+from pytensor.tensor import TensorVariable
+from pytensor.tensor.slinalg import Solve, SolveTriangular
 from tests import unittest_tools as utt
 from tests.link.numba.test_basic import compare_numba_and_py
 
@@ -408,66 +409,109 @@ def lu_solve(a, b, trans, overwrite_a, overwrite_b):
 @pytest.mark.filterwarnings(
     'ignore:Cannot cache compiled function "numba_funcified_fgraph"'
 )
-def test_solve(b_shape: tuple[int], assume_a: Literal["gen", "sym", "pos"]):
-    A = pt.matrix("A", dtype=floatX)
-    b = pt.tensor("b", shape=b_shape, dtype=floatX)
-
-    A_val = np.asfortranarray(np.random.normal(size=(5, 5)).astype(floatX))
-    b_val = np.asfortranarray(np.random.normal(size=b_shape).astype(floatX))
-
+@pytest.mark.parametrize(
+    "overwrite_a, overwrite_b",
+    [(False, False), (True, False), (False, True)],
+    ids=["no_overwrite", "overwrite_a", "overwrite_b"],
+)
+def test_solve(
+    b_shape: tuple[int],
+    assume_a: Literal["gen", "sym", "pos"],
+    overwrite_a: bool,
+    overwrite_b: bool,
+):
     def A_func(x):
         if assume_a == "pos":
             x = x @ x.T
         elif assume_a == "sym":
             x = (x + x.T) / 2
+        elif assume_a == "tridiagonal":
+            lib = pt if isinstance(x, TensorVariable) else np
+            diag_fn = getattr(lib, "diag")
+            eye_fn = getattr(lib, "eye")
+            concatenate_fn = getattr(lib, "concatenate")
+
+            ud = diag_fn(x, 1)
+            ld = diag_fn(x, -1)
+            # Set ud and ld to zeros
+            d = (x - diag_fn(ud, 1) - diag_fn(ld, -1)).sum(0)
+            return x * (
+                eye_fn(x.shape[1], k=0) * d
+                + eye_fn(x.shape[1], k=-1) * concatenate_fn([[0], ld], axis=-1)
+                + eye_fn(x.shape[1], k=1) * concatenate_fn([ud, [0]], axis=-1)
+            )
         return x
 
+    A = pt.matrix("A", dtype=floatX)
+    b = pt.tensor("b", shape=b_shape, dtype=floatX)
+
+    rng = np.random.default_rng(418)
+    A_val = np.asfortranarray(A_func(rng.normal(size=(5, 5))).astype(floatX))
+    b_val = np.asfortranarray(rng.normal(size=b_shape).astype(floatX))
+
     X = pt.linalg.solve(
-        A_func(A),
+        A,
         b,
         assume_a=assume_a,
         b_ndim=len(b_shape),
     )
-    f = pytensor.function(
-        [pytensor.In(A, mutable=True), pytensor.In(b, mutable=True)], X, mode="NUMBA"
-    )
-    op = f.maker.fgraph.outputs[0].owner.op
-
-    compare_numba_and_py([A, b], [X], test_inputs=[A_val, b_val], inplace=True)
-
-    # Calling this is destructive and will rewrite b_val to be the answer. Store copies of the inputs first.
-    A_val_copy = A_val.copy()
-    b_val_copy = b_val.copy()
 
-    X_np = f(A_val, b_val)
-
-    # overwrite_b is preferred when both inputs can be destroyed
-    assert op.destroy_map == {0: [1]}
-
-    # Confirm inputs were destroyed by checking against the copies
-    assert (A_val == A_val_copy).all() == (op.destroy_map.get(0, None) != [0])
-    assert (b_val == b_val_copy).all() == (op.destroy_map.get(0, None) != [1])
-
-    ATOL = 1e-8 if floatX.endswith("64") else 1e-4
-    RTOL = 1e-8 if floatX.endswith("64") else 1e-4
+    f, res = compare_numba_and_py(
+        [In(A, mutable=overwrite_a), In(b, mutable=overwrite_b)],
+        X,
+        test_inputs=[A_val, b_val],
+        inplace=True,
+        numba_mode="NUMBA",  # Default numba mode inplace rewrites get triggered
+    )
+    f.dprint(print_memory_map=True)
 
-    # Confirm b_val is used to store to solution
-    np.testing.assert_allclose(X_np, b_val, atol=ATOL, rtol=RTOL)
-    assert not np.allclose(b_val, b_val_copy)
+    op = f.maker.fgraph.outputs[0].owner.op
+    assert isinstance(op, Solve)
+    destroy_map = op.destroy_map
+    if overwrite_a and overwrite_b:
+        raise NotImplementedError(
+            "Test not implemented for symultaneous overwrite_a and overwrite_b, as that's not currently supported by PyTensor"
+        )
+    elif overwrite_a:
+        assert destroy_map == {0: [0]}
+    elif overwrite_b:
+        assert destroy_map == {0: [1]}
+    else:
+        assert destroy_map == {}
+
+    # Test inputs are destroyed if possible
+    A_val_f_contig = np.copy(A_val, order="F")
+    b_val_f_contig = np.copy(b_val, order="F")
+    res_f_contig = f(A_val_f_contig, b_val_f_contig)
+    np.testing.assert_allclose(res_f_contig, res)
+    assert (A_val == A_val_f_contig).all() == (op.destroy_map.get(0, None) != [0])
+    assert (b_val == b_val_f_contig).all() == (op.destroy_map.get(0, None) != [1])
+
+    # Test right results even if input cannot be destroyed because it is not F-contiguous
+    A_val_c_contig = np.copy(A_val, order="C")
+    b_val_c_contig = np.copy(b_val, order="C")
+    res_c_contig = f(A_val_c_contig, b_val_c_contig)
+    np.testing.assert_allclose(res_c_contig, res)
+    if assume_a == "sym" and overwrite_a:
+        # We can actually destroy either C or F-contiguous arrays, since they are equivalent
+        assert not np.allclose(A_val_c_contig, A_val)
+    else:
+        np.testing.assert_allclose(A_val_c_contig, A_val)
+    np.testing.assert_allclose(b_val_c_contig, b_val)
 
-    # Test that the result is numerically correct. Need to use the unmodified copy
-    np.testing.assert_allclose(
-        A_func(A_val_copy) @ X_np, b_val_copy, atol=ATOL, rtol=RTOL
+    # Test right results if inputs are not contiguous in either format
+    A_val_not_contig = np.repeat(A_val, 2, axis=0)[::2]
+    assert not (
+        A_val_not_contig.flags.c_contiguous or A_val_not_contig.flags.f_contiguous
     )
-
-    # See the note in tensor/test_slinalg.py::test_solve_correctness for details about the setup here
-    utt.verify_grad(
-        lambda A, b: pt.linalg.solve(
-            A_func(A), b, lower=False, assume_a=assume_a, b_ndim=len(b_shape)
-        ),
-        [A_val_copy, b_val_copy],
-        mode="NUMBA",
+    b_val_not_contig = np.repeat(b_val, 2, axis=0)[::2]
+    assert not (
+        b_val_not_contig.flags.c_contiguous or b_val_not_contig.flags.f_contiguous
     )
+    res_not_contig = f(A_val_not_contig, b_val_not_contig)
+    np.testing.assert_allclose(res_not_contig, res)
+    np.testing.assert_allclose(A_val_not_contig, A_val)
+    np.testing.assert_allclose(b_val_not_contig, b_val)
 
 
 @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -261,7 +261,7 @@ def assert_fn(x, y):`
`261`	`261`	`x, y`
`262`	`262`	`)`
`263`	`263`
`264`		`- if any(inp.owner is not None for inp in graph_inputs):`
	`264`	`+ if any(isinstance(inp, Variable) and inp.owner is not None for inp in graph_inputs):`
`265`	`265`	`raise ValueError("Inputs must be root variables")`
`266`	`266`
`267`	`267`	`pytensor_py_fn = function(`