
Commit 2491bc2

Specialize AdvancedSubtensor1 mode for compile time valid indices
1 parent 0f5da80 commit 2491bc2

File tree

2 files changed: +76 -27 lines changed


pytensor/tensor/subtensor.py

Lines changed: 51 additions & 27 deletions
@@ -2120,16 +2120,12 @@ def make_node(self, x, ilist):
         out_shape = (ilist_.type.shape[0], *x_.type.shape[1:])
         return Apply(self, [x_, ilist_], [TensorType(dtype=x.dtype, shape=out_shape)()])
 
-    def perform(self, node, inp, out_):
+    def perform(self, node, inp, output_storage):
         x, i = inp
-        (out,) = out_
-        # Copy always implied by numpy advanced indexing semantic.
-        if out[0] is not None and out[0].shape == (len(i),) + x.shape[1:]:
-            o = out[0]
-        else:
-            o = None
 
-        out[0] = x.take(i, axis=0, out=o)
+        # Numpy take is always slower when out is provided
+        # https://github.com/numpy/numpy/issues/28636
+        output_storage[0][0] = x.take(i, axis=0, out=None)
 
     def connection_pattern(self, node):
         rval = [[True], *([False] for _ in node.inputs[1:])]
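
Note: the new perform comment can be checked directly against NumPy. A minimal standalone sketch (not part of the commit; timings vary by NumPy version and machine) comparing np.take with a freshly allocated result versus a preallocated out buffer, per https://github.com/numpy/numpy/issues/28636:

import timeit

import numpy as np

x = np.random.normal(size=(1000, 16))
idx = np.random.randint(0, 1000, size=10_000)
buf = np.empty((idx.shape[0], 16), dtype=x.dtype)

# Let np.take allocate the result itself (what perform now does with out=None)
t_alloc = timeit.timeit(lambda: np.take(x, idx, axis=0), number=1_000)
# Reuse a preallocated buffer; reported to be slower in the issue above
t_out = timeit.timeit(lambda: np.take(x, idx, axis=0, out=buf), number=1_000)
print(f"fresh allocation: {t_alloc:.4f}s, preallocated out: {t_out:.4f}s")
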
@@ -2174,42 +2170,70 @@ def c_code(self, node, name, input_names, output_names, sub):
                 "c_code defined for AdvancedSubtensor1, not for child class",
                 type(self),
             )
+        x, idxs = node.inputs
+        shape0 = x.type.shape[0]
+        if (
+            shape0 is not None
+            and isinstance(idxs, Constant)
+            and (
+                (idxs.data.max() < shape0)
+                and ((idxs.data.min() >= 0) or (idxs.data.min() > -shape0))
+            )
+        ):
+            # We can know ahead of time that all indices are valid, so we can use a faster mode
+            mode = "NPY_WRAP"  # This seems to be faster than NPY_CLIP
+        else:
+            mode = "NPY_RAISE"
         a_name, i_name = input_names[0], input_names[1]
         output_name = output_names[0]
         fail = sub["fail"]
-        return f"""
-        if ({output_name} != NULL) {{
-            npy_intp nd, i, *shape;
-            nd = PyArray_NDIM({a_name}) + PyArray_NDIM({i_name}) - 1;
-            if (PyArray_NDIM({output_name}) != nd) {{
+        if mode == "NPY_RAISE":
+            # numpy_take always makes an intermediate copy if NPY_RAISE which is slower than just allocating a new buffer
+            # We can remove this special case after https://github.com/numpy/numpy/issues/28636
+            manage_pre_allocated_out = f"""
+            if ({output_name} != NULL) {{
+                // Numpy TakeFrom is always slower when copying
+                // https://github.com/numpy/numpy/issues/28636
                 Py_CLEAR({output_name});
             }}
-            else {{
-                shape = PyArray_DIMS({output_name});
-                for (i = 0; i < PyArray_NDIM({i_name}); i++) {{
-                    if (shape[i] != PyArray_DIMS({i_name})[i]) {{
-                        Py_CLEAR({output_name});
-                        break;
-                    }}
+            """
+        else:
+            manage_pre_allocated_out = f"""
+            if ({output_name} != NULL) {{
+                npy_intp nd = PyArray_NDIM({a_name}) + PyArray_NDIM({i_name}) - 1;
+                if (PyArray_NDIM({output_name}) != nd) {{
+                    Py_CLEAR({output_name});
                 }}
-                if ({output_name} != NULL) {{
-                    for (; i < nd; i++) {{
-                        if (shape[i] != PyArray_DIMS({a_name})[
-                                i-PyArray_NDIM({i_name})+1]) {{
+                else {{
+                    int i;
+                    npy_intp* shape = PyArray_DIMS({output_name});
+                    for (i = 0; i < PyArray_NDIM({i_name}); i++) {{
+                        if (shape[i] != PyArray_DIMS({i_name})[i]) {{
                             Py_CLEAR({output_name});
                             break;
                         }}
                     }}
+                    if ({output_name} != NULL) {{
+                        for (; i < nd; i++) {{
+                            if (shape[i] != PyArray_DIMS({a_name})[i-PyArray_NDIM({i_name})+1]) {{
+                                Py_CLEAR({output_name});
+                                break;
+                            }}
+                        }}
+                    }}
                 }}
             }}
-        }}
+            """
+
+        return f"""
+        {manage_pre_allocated_out}
         {output_name} = (PyArrayObject*)PyArray_TakeFrom(
-                   {a_name}, (PyObject*){i_name}, 0, {output_name}, NPY_RAISE);
+                   {a_name}, (PyObject*){i_name}, 0, {output_name}, {mode});
         if ({output_name} == NULL) {fail};
         """
 
     def c_code_cache_version(self):
-        return (4,)
+        return (5,)
 
 
 advanced_subtensor1 = AdvancedSubtensor1()
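
Note: NPY_WRAP and NPY_RAISE above are the C-level counterparts of np.take's mode argument. A hedged Python-level illustration (not from the commit) of why the cheaper mode is safe once the static shape and constant indices prove every index is in range:

import numpy as np

x = np.arange(5) * 10.0                        # shape (5,)
idx = np.array([0, 2, 4, 1])                   # every index valid for length 5

print(np.take(x, idx, mode="raise"))           # bounds-checked: [ 0. 20. 40. 10.]
print(np.take(x, idx, mode="wrap"))            # same result, without the error path
print(np.take(x, np.array([7]), mode="wrap"))  # out of range wraps: 7 % 5 == 2 -> [20.]

mode="raise" validates every index, while "wrap" reduces indices modulo the axis length, so the two only agree when the indices are already known to be valid, which is exactly the condition the new c_code verifies at compile time.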

tests/tensor/test_subtensor.py

Lines changed: 25 additions & 0 deletions
@@ -3003,3 +3003,28 @@ def test_flip(size: tuple[int]):
     z = flip(x_pt, axis=list(axes))
     f = pytensor.function([x_pt], z, mode="FAST_COMPILE")
     np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL)
+
+
+class TestBenchmarks:
+    @pytest.mark.parametrize(
+        "static_shape", (False, True), ids=lambda x: f"static_shape={x}"
+    )
+    @pytest.mark.parametrize("gc", (False, True), ids=lambda x: f"gc={x}")
+    def test_advanced_subtensor1(self, static_shape, gc, benchmark):
+        x = vector("x", shape=(85 if static_shape else None,))
+
+        x_values = np.random.normal(size=(85,))
+        idxs_values = np.arange(85).repeat(11)
+
+        # With static shape and constant indices we know all idxs are valid
+        # And can use faster mode in numpy.take
+        out = x[idxs_values]
+
+        fn = pytensor.function(
+            [x],
+            pytensor.Out(out, borrow=True),
+            on_unused_input="ignore",
+            trust_input=True,
+        )
+        fn.vm.allow_gc = gc
+        benchmark(fn, x_values, idxs_values)
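
Note: a standalone timing sketch (not part of the commit) that mirrors the benchmark above without pytest-benchmark; it drops the Out(borrow=True)/trust_input/allow_gc settings for simplicity, assumes the C backend is available, and absolute numbers will vary:

import timeit

import numpy as np
import pytensor
from pytensor.tensor import vector

x_values = np.random.normal(size=(85,))
idxs_values = np.arange(85).repeat(11)

for static_shape in (False, True):
    x = vector("x", shape=(85 if static_shape else None,))
    # With a static shape and constant indices the specialized c_code can use NPY_WRAP
    out = x[idxs_values]
    fn = pytensor.function([x], out)
    t = timeit.timeit(lambda: fn(x_values), number=10_000)
    print(f"static_shape={static_shape}: {t:.3f}s for 10,000 calls")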
