pymc-devs
diff --git a/pytensor/link/numba/dispatch/__init__.py
Lines changed: 1 addition & 1 deletion b/pytensor/link/numba/dispatch/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/pytensor/link/numba/dispatch/blas.py
Lines changed: 34 additions & 0 deletions b/pytensor/link/numba/dispatch/blas.py
Lines changed: 34 additions & 0 deletions
diff --git a/pytensor/link/numba/dispatch/linalg/dot/banded.py
Lines changed: 99 additions & 24 deletions b/pytensor/link/numba/dispatch/linalg/dot/banded.py
Lines changed: 99 additions & 24 deletions
diff --git a/pytensor/link/numba/dispatch/slinalg.py
Lines changed: 0 additions & 18 deletions b/pytensor/link/numba/dispatch/slinalg.py
Lines changed: 0 additions & 18 deletions
@@ -14,6 +14,6 @@
 import pytensor.link.numba.dispatch.sparse
 import pytensor.link.numba.dispatch.subtensor
 import pytensor.link.numba.dispatch.tensor_basic
-
+import pytensor.link.numba.dispatch.blas
 
 # isort: on
@@ -0,0 +1,34 @@
+from pytensor.link.numba.dispatch import numba_funcify
+from pytensor.link.numba.dispatch.basic import numba_njit
+from pytensor.link.numba.dispatch.linalg.dot.banded import _gbmv
+from pytensor.link.numba.dispatch.slinalg import _COMPLEX_DTYPE_NOT_SUPPORTED_MSG
+from pytensor.tensor.blas import BandedGEMV
+from pytensor.tensor.type import complex_dtypes
+
+
+@numba_funcify.register(BandedGEMV)
+def numba_funcify_BandedGEMV(op, node, **kwargs):
+    kl = op.lower_diags
+    ku = op.upper_diags
+    overwrite_y = op.overwrite_y
+    trans = int(op.transpose)
+    dtype = node.inputs[0].dtype
+
+    if dtype in complex_dtypes:
+        raise NotImplementedError(_COMPLEX_DTYPE_NOT_SUPPORTED_MSG.format(op=op))
+
+    @numba_njit(cache=False)
+    def banded_gemv(A, x, y, alpha, beta):
+        return _gbmv(
+            A=A,
+            x=x,
+            kl=kl,
+            ku=ku,
+            y=y,
+            alpha=alpha,
+            beta=beta,
+            overwrite_y=overwrite_y,
+            trans=trans,
+        )
+
+    return banded_gemv
@@ -12,7 +12,11 @@
     _get_underlying_float,
     val_to_int_ptr,
 )
-from pytensor.link.numba.dispatch.linalg.utils import _check_scipy_linalg_matrix
+from pytensor.link.numba.dispatch.linalg.utils import (
+    _check_scipy_linalg_matrix,
+    _copy_to_fortran_order_even_if_1d,
+    _trans_char_to_int,
+)
 
 
 @numba_njit(inline="always")
@@ -32,69 +36,140 @@ def A_to_banded(A: np.ndarray, kl: int, ku: int) -> np.ndarray:
     return A_banded
 
 
-def _dot_banded(A: np.ndarray, x: np.ndarray, kl: int, ku: int) -> Any:
+def _gbmv(
+    alpha: np.ndarray,
+    A: np.ndarray,
+    x: np.ndarray,
+    kl: int,
+    ku: int,
+    beta: np.ndarray | None = None,
+    y: np.ndarray | None = None,
+    overwrite_y: bool = False,
+    trans: int = 1,
+) -> Any:
     """
     Thin wrapper around gbmv. This code will only be called if njit is disabled globally
     (e.g. during testing)
     """
-    fn = linalg.get_blas_funcs("gbmv", (A, x))
+    (fn,) = linalg.get_blas_funcs(("gbmv",), (A, x))
     m, n = A.shape
     A_banded = A_to_banded(A, kl=kl, ku=ku)
 
-    return fn(m=m, n=n, kl=kl, ku=ku, alpha=1, a=A_banded, x=x)
-
-
-@overload(_dot_banded)
-def dot_banded_impl(
-    A: np.ndarray, x: np.ndarray, kl: int, ku: int
-) -> Callable[[np.ndarray, np.ndarray, int, int], np.ndarray]:
+    incx = x.strides[0] // x.itemsize
+    incy = y.strides[0] // y.itemsize if y is not None else 1
+
+    offx = 0 if incx >= 0 else -x.size + 1
+    offy = 0 if incy >= 0 else -y.size + 1
+
+    return fn(
+        m=m,
+        n=n,
+        kl=kl,
+        ku=ku,
+        a=A_banded,
+        alpha=alpha,
+        x=x,
+        incx=incx,
+        offx=offx,
+        beta=beta,
+        y=y,
+        overwrite_y=overwrite_y,
+        incy=incy,
+        offy=offy,
+        trans=trans,
+    )
+
+
+@overload(_gbmv)
+def gbmv_impl(
+    alpha: np.ndarray,
+    A: np.ndarray,
+    x: np.ndarray,
+    kl: int,
+    ku: int,
+    beta: np.ndarray | None = None,
+    y: np.ndarray | None = None,
+    overwrite_y: bool = False,
+    trans: int = 1,
+) -> Callable[
+    [
+        np.ndarray,
+        np.ndarray,
+        np.ndarray,
+        int,
+        int,
+        np.ndarray | None,
+        np.ndarray | None,
+        bool,
+        int,
+    ],
+    np.ndarray,
+]:
     ensure_lapack()
     ensure_blas()
     _check_scipy_linalg_matrix(A, "dot_banded")
     dtype = A.dtype
     w_type = _get_underlying_float(dtype)
     numba_gbmv = _BLAS().numba_xgbmv(dtype)
 
-    def impl(A: np.ndarray, x: np.ndarray, kl: int, ku: int) -> np.ndarray:
+    def impl(
+        alpha: np.ndarray,
+        A: np.ndarray,
+        x: np.ndarray,
+        kl: int,
+        ku: int,
+        beta: np.ndarray | None = None,
+        y: np.ndarray | None = None,
+        overwrite_y: bool = False,
+        trans: int = 1,
+    ) -> np.ndarray:
         m, n = A.shape
 
         A_banded = A_to_banded(A, kl=kl, ku=ku)
-        stride = x.strides[0] // x.itemsize
+        x_stride = x.strides[0] // x.itemsize
+
+        if beta is None:
+            beta = np.zeros((), dtype=dtype)
 
-        TRANS = val_to_int_ptr(ord("N"))
+        if y is None:
+            y_copy = np.empty(shape=(m,), dtype=dtype)
+        elif overwrite_y and y.flags.f_contiguous:
+            y_copy = y
+        else:
+            y_copy = _copy_to_fortran_order_even_if_1d(y)
+
+        y_stride = y_copy.strides[0] // y_copy.itemsize
+
+        TRANS = val_to_int_ptr(_trans_char_to_int(trans))
         M = val_to_int_ptr(m)
         N = val_to_int_ptr(n)
         LDA = val_to_int_ptr(A_banded.shape[0])
 
         KL = val_to_int_ptr(kl)
         KU = val_to_int_ptr(ku)
 
-        ALPHA = np.array(1.0, dtype=dtype)
-
-        INCX = val_to_int_ptr(stride)
-        BETA = np.array(0.0, dtype=dtype)
-        Y = np.empty(m, dtype=dtype)
-        INCY = val_to_int_ptr(1)
+        INCX = val_to_int_ptr(x_stride)
+        INCY = val_to_int_ptr(y_stride)
 
         numba_gbmv(
             TRANS,
             M,
             N,
             KL,
             KU,
-            ALPHA.view(w_type).ctypes,
+            alpha.view(w_type).ctypes,
             A_banded.view(w_type).ctypes,
             LDA,
             # x.view().ctypes is creating a pointer to the beginning of the memory where the array is. When we have
             # a negative stride, we need to trick BLAS by pointing to the last element of the array.
             # The [-1:] slice is a workaround to make sure x remains an array (otherwise it has no .ctypes)
-            (x if stride >= 0 else x[-1:]).view(w_type).ctypes,
+            (x if x_stride >= 0 else x[-1:]).view(w_type).ctypes,
             INCX,
-            BETA.view(w_type).ctypes,
-            Y.view(w_type).ctypes,
+            beta.view(w_type).ctypes,
+            y_copy.view(w_type).ctypes,
             INCY,
         )
 
-        return Y
+        return y_copy
 
     return impl
@@ -11,7 +11,6 @@
     _pivot_to_permutation,
 )
 from pytensor.link.numba.dispatch.linalg.decomposition.lu_factor import _lu_factor
-from pytensor.link.numba.dispatch.linalg.dot.banded import _dot_banded
 from pytensor.link.numba.dispatch.linalg.solve.cholesky import _cho_solve
 from pytensor.link.numba.dispatch.linalg.solve.general import _solve_gen
 from pytensor.link.numba.dispatch.linalg.solve.posdef import _solve_psd
@@ -20,7 +19,6 @@
 from pytensor.link.numba.dispatch.linalg.solve.tridiagonal import _solve_tridiagonal
 from pytensor.tensor.slinalg import (
     LU,
-    BandedDot,
     BlockDiagonal,
     Cholesky,
     CholeskySolve,
@@ -313,19 +311,3 @@ def cho_solve(c, b):
         )
 
     return cho_solve
-
-
-@numba_funcify.register(BandedDot)
-def numba_funcify_BandedDot(op, node, **kwargs):
-    kl = op.lower_diags
-    ku = op.upper_diags
-    dtype = node.inputs[0].dtype
-
-    if dtype in complex_dtypes:
-        raise NotImplementedError(_COMPLEX_DTYPE_NOT_SUPPORTED_MSG.format(op=op))
-
-    @numba_njit(cache=False)
-    def banded_dot(A, x):
-        return _dot_banded(A, x, kl=kl, ku=ku)
-
-    return banded_dot