Faster implementation of numba convolve1d

ricardoV94 · ricardoV94 · commit 158f2237ca4b · 2025-04-24T19:06:23.000+02:00
diff --git a/pytensor/link/numba/dispatch/signal/conv.py b/pytensor/link/numba/dispatch/signal/conv.py
@@ -1,4 +1,5 @@
 import numpy as np
+from numba.np.arraymath import _get_inner_prod
 
 from pytensor.link.numba.dispatch import numba_funcify
 from pytensor.link.numba.dispatch.basic import numba_njit
@@ -7,10 +8,90 @@
 
 @numba_funcify.register(Convolve1d)
 def numba_funcify_Conv1d(op, node, **kwargs):
+    """Build a numba-jitted 1D convolution specialized for ``op.mode``.
+
+    The output dtype is the NumPy promotion of the two input dtypes. The
+    returned function takes two 1D arrays, uses the shorter one as the
+    kernel, and raises ``ValueError`` if either array is empty.
+
+    Raises
+    ------
+    ValueError
+        If ``op.mode`` is neither ``"valid"`` nor ``"full"``.
+    """
+    # This specialized version is faster than the overloaded numba np.convolve,
+    # as it avoids several runtime checks that don't seem to be inlined.
     mode = op.mode
 
-    @numba_njit
-    def conv1d(data, kernel):
-        return np.convolve(data, kernel, mode=mode)
+    a_dt = np.dtype(node.inputs[0].dtype)
+    b_dt = np.dtype(node.inputs[1].dtype)
+    dt = np.promote_types(a_dt, b_dt)
+    # NOTE: private numba helper; presumably a dtype-specialized inner product.
+    innerprod = _get_inner_prod(a_dt, b_dt)
 
-    return conv1d
+    if mode == "valid":
+
+        def valid_convolve1d(x, y):
+            nx = len(x)
+            ny = len(y)
+            # Reject empty inputs, as np.convolve does, rather than returning
+            # a meaningless result.
+            if nx == 0 or ny == 0:
+                raise ValueError("convolve1d inputs cannot be empty")
+            if nx < ny:
+                # Convolution is commutative: make y the shorter array.
+                x, y = y, x
+                nx, ny = ny, nx
+            y_flipped = y[::-1]
+
+            length = nx - ny + 1
+            ret = np.empty(length, dt)
+
+            for i in range(length):
+                ret[i] = innerprod(x[i : i + ny], y_flipped)
+
+            return ret
+
+        return numba_njit(valid_convolve1d)
+
+    elif mode == "full":
+
+        def full_convolve1d(x, y):
+            nx = len(x)
+            ny = len(y)
+            # Reject empty inputs, as np.convolve does. Without this guard an
+            # empty input can make the loops below write past the end of `ret`.
+            if nx == 0 or ny == 0:
+                raise ValueError("convolve1d inputs cannot be empty")
+            if nx < ny:
+                x, y = y, x
+                nx, ny = ny, nx
+            y_flipped = y[::-1]
+
+            length = nx + ny - 1
+            ret = np.empty(length, dt)
+            idx = 0
+
+            # Leading edge: the kernel only partially overlaps the start of x.
+            for i in range(ny - 1):
+                k = i + 1
+                ret[idx] = innerprod(x[:k], y_flipped[-k:])
+                idx = idx + 1
+
+            # Interior: the kernel fully overlaps x.
+            for i in range(nx - ny + 1):
+                ret[idx] = innerprod(x[i : i + ny], y_flipped)
+                idx = idx + 1
+
+            # Trailing edge: the kernel only partially overlaps the end of x.
+            for i in range(ny - 1):
+                k = ny - i - 1
+                ret[idx] = innerprod(x[-k:], y_flipped[:k])
+                idx = idx + 1
+
+            return ret
+
+        return numba_njit(full_convolve1d)
+
+    else:
+        raise ValueError(f"Unsupported mode: {mode}")
diff --git a/tests/link/numba/signal/test_conv.py b/tests/link/numba/signal/test_conv.py
@@ -1,7 +1,8 @@
 import numpy as np
 import pytest
 
-from pytensor.tensor import dmatrix
+from pytensor import function
+from pytensor.tensor import dmatrix, vector
 from pytensor.tensor.signal import convolve1d
 from tests.link.numba.test_basic import compare_numba_and_py
 
@@ -10,13 +11,33 @@
 
 
 @pytest.mark.parametrize("mode", ["full", "valid", "same"])
-def test_convolve1d(mode):
+@pytest.mark.parametrize("x_smaller", (False, True))  # exercise both operand orders
+def test_convolve1d(x_smaller, mode):
     x = dmatrix("x")
     y = dmatrix("y")
-    out = convolve1d(x[None], y[:, None], mode=mode)
+    if x_smaller:  # x-derived operand (test data has the shorter rows) goes first
+        out = convolve1d(x[None], y[:, None], mode=mode)
+    else:  # same operands, passed in swapped order
+        out = convolve1d(y[:, None], x[None], mode=mode)
 
     rng = np.random.default_rng()
     test_x = rng.normal(size=(3, 5))
     test_y = rng.normal(size=(7, 11))
     # Blockwise dispatch for numba can't be run on object mode
     compare_numba_and_py([x, y], out, [test_x, test_y], eval_obj_mode=False)
+
+
+@pytest.mark.parametrize("mode", ("full", "valid"))
+def test_convolve_benchmark(mode, benchmark):
+    x = vector(shape=(183,))  # statically shaped inputs: 183 and 6 elements
+    y = vector(shape=(6,))
+    out = convolve1d(x, y, mode=mode)
+    fn = function([x, y], out, mode="NUMBA", trust_input=True)  # numba backend
+
+    rng = np.random.default_rng()
+    x_test = rng.normal(size=(x.type.shape)).astype(x.type.dtype)
+    y_test = rng.normal(size=(y.type.shape)).astype(y.type.dtype)
+    np.testing.assert_allclose(  # check the result against NumPy before timing
+        fn(x_test, y_test), np.convolve(x_test, y_test, mode=mode)
+    )
+    benchmark(fn, x_test, y_test)  # presumably the pytest-benchmark fixture