Respect overwrite_a and overwrite_b arguments

jessegrabowski · jessegrabowski · commit 54a2c6e06b4d · 2025-01-05T13:07:47.000+08:00
diff --git a/pytensor/link/numba/dispatch/slinalg.py b/pytensor/link/numba/dispatch/slinalg.py
@@ -107,6 +107,7 @@ def impl(A, B, trans=0, lower=False, unit_diagonal=False):
         _solve_check_input_shapes(A, B)
 
         B_is_1d = B.ndim == 1
+
         if B_is_1d:
             B_copy = np.asfortranarray(np.expand_dims(B, -1))
         else:
@@ -387,7 +388,6 @@ def xgecon_impl(A, A_norm, norm):
 
     def impl(A, A_norm, norm):
         _N = np.int32(A.shape[-1])
-        A_copy = _copy_to_fortran_order(A)
 
         N = val_to_int_ptr(_N)
         LDA = val_to_int_ptr(_N)
@@ -401,7 +401,7 @@ def impl(A, A_norm, norm):
         numba_gecon(
             NORM,
             N,
-            A_copy.view(w_type).ctypes,
+            A.view(w_type).ctypes,
             LDA,
             A_NORM.view(w_type).ctypes,
             RCOND.view(w_type).ctypes,
@@ -425,16 +425,20 @@ def _getrf():
 
 
 @overload(_getrf)
-def getrf_impl(A):
+def getrf_impl(A, overwrite_a=False):
     ensure_lapack()
     _check_scipy_linalg_matrix(A, "getrf")
     dtype = A.dtype
     w_type = _get_underlying_float(dtype)
     numba_getrf = _LAPACK().numba_xgetrf(dtype)
 
-    def impl(A):
+    def impl(A, overwrite_a=False):
         _M, _N = np.int32(A.shape[-2:])
-        A_copy = _copy_to_fortran_order(A)
+
+        if not overwrite_a:
+            A_copy = _copy_to_fortran_order(A)
+        else:
+            A_copy = A
 
         M = val_to_int_ptr(_M)
         N = val_to_int_ptr(_N)
@@ -459,23 +463,27 @@ def _getrs():
 
 
 @overload(_getrs)
-def getrs_impl(LU, B, IPIV, trans=0):
+def getrs_impl(LU, B, IPIV, trans=0, overwrite_b=False):
     ensure_lapack()
     _check_scipy_linalg_matrix(LU, "getrs")
     _check_scipy_linalg_matrix(B, "getrs")
     dtype = LU.dtype
     w_type = _get_underlying_float(dtype)
     numba_getrs = _LAPACK().numba_xgetrs(dtype)
 
-    def impl(LU, B, IPIV, trans=0):
+    def impl(LU, B, IPIV, trans=0, overwrite_b=False):
         _N = np.int32(LU.shape[-1])
         _solve_check_input_shapes(LU, B)
 
         B_is_1d = B.ndim == 1
-        if B_is_1d:
-            B_copy = np.asfortranarray(np.expand_dims(B, -1))
-        else:
+
+        if not overwrite_b:
             B_copy = _copy_to_fortran_order(B)
+        else:
+            B_copy = B
+        if B_is_1d:
+            B_copy = np.asfortranarray(np.expand_dims(B_copy, -1))
+
         B_NDIM = 1 if B_is_1d else int(B.shape[-1])
 
         TRANS = val_to_int_ptr(_trans_char_to_int(trans))
@@ -591,13 +599,21 @@ def sysv_impl(A, B, lower=False, overwrite_a=False, overwrite_b=False):
     def impl(A, B, lower=False, overwrite_a=False, overwrite_b=False):
         _LDA, _N = np.int32(A.shape[-2:])
         _solve_check_input_shapes(A, B)
-        A_copy = _copy_to_fortran_order(A)
 
-        B_is_1d = B.ndim == 1
-        if B_is_1d:
-            B_copy = np.asfortranarray(np.expand_dims(B, -1))
+        if not overwrite_a:
+            A_copy = _copy_to_fortran_order(A)
         else:
+            A_copy = A
+
+        B_is_1d = B.ndim == 1
+
+        if not overwrite_b:
             B_copy = _copy_to_fortran_order(B)
+        else:
+            B_copy = B
+        if B_is_1d:
+            B_copy = np.asfortranarray(np.expand_dims(B_copy, -1))
+
         B_NDIM = 1 if B_is_1d else int(B.shape[-1])
 
         UPLO = val_to_int_ptr(ord("L") if lower else ord("U"))
@@ -790,13 +806,22 @@ def impl(
         _solve_check_input_shapes(A, B)
 
         _N = np.int32(A.shape[-1])
-        A_copy = _copy_to_fortran_order(A)
 
-        B_is_1d = B.ndim == 1
-        if B_is_1d:
-            B_copy = np.asfortranarray(np.expand_dims(B, -1))
+        if not overwrite_a:
+            A_copy = _copy_to_fortran_order(A)
         else:
+            A_copy = A
+
+        B_is_1d = B.ndim == 1
+
+        if not overwrite_b:
             B_copy = _copy_to_fortran_order(B)
+        else:
+            B_copy = B
+        if B_is_1d:
+            B_copy = np.asfortranarray(np.expand_dims(B_copy, -1))
+
+        B_NDIM = 1 if B_is_1d else int(B.shape[-1])
 
         UPLO = val_to_int_ptr(ord("L") if lower else ord("U"))
         B_NDIM = 1 if B_is_1d else int(B.shape[-1])
@@ -889,6 +914,11 @@ def numba_funcify_Solve(op, node, **kwargs):
         solve_fn = _solve_gen
     elif assume_a == "sym":
         solve_fn = _solve_symmetric
+    elif assume_a == "her":
+        raise NotImplementedError(
+            'Use assume_a = "sym" for symmetric real matrices. If you need complex support, '
+            "please open an issue on github."
+        )
     elif assume_a == "pos":
         solve_fn = _solve_psd
     else:
diff --git a/pytensor/tensor/slinalg.py b/pytensor/tensor/slinalg.py
@@ -436,6 +436,7 @@ def solve_triangular(
     trans: int | str = 0,
     lower: bool = False,
     unit_diagonal: bool = False,
+    overwrite_b: bool = False,
     check_finite: bool = True,
     b_ndim: int | None = None,
 ) -> TensorVariable:
@@ -461,6 +462,8 @@ def solve_triangular(
         Whether to check that the input matrices contain only finite numbers.
         Disabling may give a performance gain, but may result in problems
         (crashes, non-termination) if the inputs do contain infinities or NaNs.
+    overwrite_b: bool, optional
+        If True, the memory allocated to input ``b`` is re-used for the output. Default is False.
     b_ndim : int
         Whether the core case of b is a vector (1) or matrix (2).
         This will influence how batched dimensions are interpreted.
@@ -472,6 +475,7 @@ def solve_triangular(
             trans=trans,
             unit_diagonal=unit_diagonal,
             check_finite=check_finite,
+            overwrite_b=overwrite_b,
             b_ndim=b_ndim,
         )
     )(a, b)
@@ -537,6 +541,8 @@ def solve(
     lower=False,
     check_finite=True,
     transposed=False,
+    overwrite_a=False,
+    overwrite_b=False,
     b_ndim: int | None = None,
 ):
     """Solves the linear equation set ``a * x = b`` for the unknown ``x`` for square ``a`` matrix.
@@ -574,6 +580,10 @@ def solve(
         (crashes, non-termination) if the inputs do contain infinities or NaNs.
     assume_a : str, optional
         Valid entries are explained above.
+    overwrite_a: bool, optional
+        If True, allow ``a`` to be overwritten and used as work space, avoiding a new allocation. Default is False.
+    overwrite_b: bool, optional
+        If True, store the result in the memory of ``b``. Otherwise, allocate new memory. Default is False.
     transposed: bool, optional
         If True, solve ``A.T @ x = b``
     b_ndim : int
@@ -588,6 +598,8 @@ def solve(
             assume_a=assume_a,
             b_ndim=b_ndim,
             transposed=transposed,
+            overwrite_a=overwrite_a,
+            overwrite_b=overwrite_b,
         )
     )(a, b)
 
diff --git a/tests/link/numba/test_slinalg.py b/tests/link/numba/test_slinalg.py
@@ -2,12 +2,12 @@
 
 import numpy as np
 import pytest
+from numpy.testing import assert_allclose
 
 import pytensor
 import pytensor.tensor as pt
 from pytensor.graph import FunctionGraph
 from tests.link.numba.test_basic import compare_numba_and_py
-from tests.unittest_tools import assert_allclose
 
 
 numba = pytest.importorskip("numba")
@@ -42,7 +42,10 @@ def transpose_func(x, trans):
 @pytest.mark.filterwarnings(
     'ignore:Cannot cache compiled function "numba_funcified_fgraph"'
 )
-def test_solve_triangular(b_func, b_size, lower, trans, unit_diag, complex):
+@pytest.mark.parametrize("overwrite_b", [True, False])
+def test_solve_triangular(
+    b_func, b_size, lower, trans, unit_diag, complex, overwrite_b
+):
     if complex:
         # TODO: Complex raises ValueError: To change to a dtype of a different size, the last axis must be contiguous,
         #  why?
@@ -55,7 +58,7 @@ def test_solve_triangular(b_func, b_size, lower, trans, unit_diag, complex):
     b = b_func("b", dtype=dtype)
 
     X = pt.linalg.solve_triangular(
-        A, b, lower=lower, trans=trans, unit_diagonal=unit_diag
+        A, b, lower=lower, trans=trans, unit_diagonal=unit_diag, overwrite_b=overwrite_b
     )
     f = pytensor.function([A, b], X, mode="NUMBA")
 
@@ -84,6 +87,9 @@ def test_solve_triangular(b_func, b_size, lower, trans, unit_diag, complex):
         transpose_func(A_tri, trans) @ X_np, b, atol=ATOL, rtol=RTOL
     )
 
+    if overwrite_b:
+        assert_allclose(X_np, b)
+
 
 @pytest.mark.parametrize("value", [np.nan, np.inf])
 @pytest.mark.filterwarnings(
@@ -235,31 +241,42 @@ def gecon(x, norm):
     np.testing.assert_allclose(rcond, rcond2)
 
 
-def test_getrf():
+@pytest.mark.parametrize("overwrite_a", [True, False])
+def test_getrf(overwrite_a):
     from scipy.linalg import lu_factor
 
     from pytensor.link.numba.dispatch.slinalg import _getrf
 
     # TODO: Refactor this test to use compare_numba_and_py after we implement lu_factor in pytensor
 
     @numba.njit()
-    def getrf(x):
-        return _getrf(x)
+    def getrf(x, overwrite_a):
+        return _getrf(x, overwrite_a=overwrite_a)
 
     x = np.random.normal(size=(5, 5)).astype(floatX)
-    LU, IPIV, info = getrf(x)
-    lu, ipiv = lu_factor(x)
+    x = np.asfortranarray(
+        x
+    )  # x needs to be fortran-contiguous going into getrf for the overwrite option to work
+
+    lu, ipiv = lu_factor(x, overwrite_a=False)
+    LU, IPIV, info = getrf(x, overwrite_a=overwrite_a)
 
     assert info == 0
     assert_allclose(LU, lu)
 
+    if overwrite_a:
+        assert_allclose(x, LU)
+
     # TODO: It seems IPIV is 1-indexed in FORTRAN, so we need to subtract 1. I can't find evidence that scipy is doing
     #  this, though.
     assert_allclose(IPIV - 1, ipiv)
 
 
 @pytest.mark.parametrize("trans", [0, 1])
-def test_getrs(trans):
+@pytest.mark.parametrize("overwrite_a", [True, False])
+@pytest.mark.parametrize("overwrite_b", [True, False])
+@pytest.mark.parametrize("b_shape", [(5,), (5, 3)], ids=["b_1d", "b_2d"])
+def test_getrs(trans, overwrite_a, overwrite_b, b_shape):
     from scipy.linalg import lu_factor
     from scipy.linalg import lu_solve as sp_lu_solve
 
@@ -268,19 +285,29 @@ def test_getrs(trans):
     # TODO: Refactor this test to use compare_numba_and_py after we implement lu_solve in pytensor
 
     @numba.njit()
-    def lu_solve(a, b, trans):
-        lu, ipiv, info = _getrf(a)
-        x, info = _getrs(lu, b, ipiv, trans)
-        return x, info
+    def lu_solve(a, b, trans, overwrite_a, overwrite_b):
+        lu, ipiv, info = _getrf(a, overwrite_a=overwrite_a)
+        x, info = _getrs(lu, b, ipiv, trans=trans, overwrite_b=overwrite_b)
+        return x, lu, info
 
     a = np.random.normal(size=(5, 5)).astype(floatX)
-    b = np.random.normal(size=(5, 3)).astype(floatX)
+    b = np.random.normal(size=b_shape).astype(floatX)
 
-    lu_and_piv = lu_factor(a)
+    # inputs need to be fortran-contiguous going into getrf and getrs for the overwrite option to work
+    a = np.asfortranarray(a)
+    b = np.asfortranarray(b)
 
-    x_sp = sp_lu_solve(lu_and_piv, b, trans)
-    x, info = lu_solve(a, b, trans)
+    lu_and_piv = lu_factor(a, overwrite_a=False)
+    x_sp = sp_lu_solve(lu_and_piv, b, trans, overwrite_b=False)
+
+    x, lu, info = lu_solve(
+        a, b, trans, overwrite_a=overwrite_a, overwrite_b=overwrite_b
+    )
     assert info == 0
+    if overwrite_a:
+        assert_allclose(a, lu)
+    if overwrite_b:
+        assert_allclose(b, x)
 
     assert_allclose(x, x_sp)
 
@@ -295,12 +322,21 @@ def lu_solve(a, b, trans):
 @pytest.mark.filterwarnings(
     'ignore:Cannot cache compiled function "numba_funcified_fgraph"'
 )
-def test_solve(b_func, b_size, assume_a, transposed):
+@pytest.mark.parametrize("overwrite_a", [True, False])
+@pytest.mark.parametrize("overwrite_b", [True, False])
+def test_solve(b_func, b_size, assume_a, transposed, overwrite_a, overwrite_b):
     A = pt.matrix("A", dtype=floatX)
     b = b_func("b", dtype=floatX)
 
     X = pt.linalg.solve(
-        A, b, lower=False, assume_a=assume_a, transposed=transposed, b_ndim=len(b_size)
+        A,
+        b,
+        lower=False,
+        assume_a=assume_a,
+        overwrite_a=overwrite_a,
+        overwrite_b=overwrite_b,
+        transposed=transposed,
+        b_ndim=len(b_size),
     )
     f = pytensor.function([A, b], X, mode="NUMBA")