Specialized C-impl for vector AdvancedIncSubtensor1

ricardoV94 · ricardoV94 · commit 7c3d45174817 · 2025-04-07T14:54:44.000+02:00
diff --git a/pytensor/tensor/subtensor.py b/pytensor/tensor/subtensor.py
@@ -2320,6 +2320,9 @@ def copy_of_x(self, x):
                 NPY_ARRAY_ENSURECOPY, NULL)"""
 
     def c_support_code(self, **kwargs):
+        if numpy_version < "1.8.0" or using_numpy_2:
+            return None
+
         types = [
             "npy_" + t
             for t in [
@@ -2510,15 +2513,105 @@ def gen_num(typen):
         return code
 
     def c_code(self, node, name, input_names, output_names, sub):
-        if numpy_version < "1.8.0" or using_numpy_2:
-            raise NotImplementedError
-
         x, y, idx = input_names
-        out = output_names[0]
+        [out] = output_names
         copy_of_x = self.copy_of_x(x)
         params = sub["params"]
         fail = sub["fail"]
 
+        x_, y_, idx_ = node.inputs
+        y_dtype = y_.type.dtype_specs()[1]
+        idx_dtype = idx_.type.dtype_specs()[1]
+        out_dtype = node.outputs[0].type.dtype_specs()[1]
+        y_bcast = y_.type.broadcastable != idx_.type.broadcastable
+        if (
+            x_.type.ndim == 1
+            and x_.type.dtype not in complex_dtypes
+            and not y_bcast
+            and y_.type.dtype not in complex_dtypes
+        ):
+            # Simple implementation for vector x, y cases
+            idx_may_be_neg = not (isinstance(idx_, Constant) and idx_.data.min() >= 0)
+            shape0 = x_.type.shape[0]
+            idx_may_be_invalid = not (
+                shape0 is not None
+                and isinstance(idx_, Constant)
+                and (idx_.data.min() > 0 or idx_.data.min() >= -shape0)
+                and (idx_.data.max() < 0 or idx_.data.max() < shape0)
+            )
+            # This is used to make sure that when we trust the indices to be valid
+            # we are not fooled by a wrong static shape
+            unexpected_shape0 = (
+                f"PyArray_SHAPE({x})[0] != {shape0}" if shape0 is not None else "0"
+            )
+
+            op = "=" if self.set_instead_of_inc else "+="
+            code = f"""
+            if ({params}->inplace)
+            {{
+                if ({x} != {out})
+                {{
+                    Py_XDECREF({out});
+                    Py_INCREF({x});
+                    {out} = {x};
+                }}
+            }}
+            else
+            {{
+                Py_XDECREF({out});
+                {out} = {copy_of_x};
+                if (!{out}) {{
+                    // Exception already set
+                    {fail}
+                }}
+            }}
+
+            if ((PyArray_NDIM({out}) != 1) || ({unexpected_shape0})) {{
+                PyErr_SetString(PyExc_ValueError, "Input x to AdvancedIncSubtensor1 does not have right shape or ndim");
+                {fail}
+            }}
+            if (PyArray_NDIM({idx}) != 1) {{
+                PyErr_SetString(PyExc_ValueError, "Input idx to AdvancedIncSubtensor1 ndim != 1");
+                {fail}
+            }}
+            if ((PyArray_NDIM({y}) != 1) || (PyArray_SHAPE({y})[0] != PyArray_SHAPE({idx})[0])) {{
+                PyErr_SetString(PyExc_ValueError, "Input y to AdvancedIncSubtensor1 does not have right shape or ndim");
+                {fail}
+            }}
+
+            {{
+                npy_intp out_shape0 = PyArray_SHAPE({out})[0];
+                {out_dtype}* out_data = ({out_dtype}*)PyArray_DATA({out});
+                {y_dtype}* y_data = ({y_dtype}*)PyArray_DATA({y});
+                {idx_dtype}* idx_data = ({idx_dtype}*)PyArray_DATA({idx});
+                npy_intp n = PyArray_SHAPE({idx})[0];
+                npy_intp out_jump = PyArray_STRIDES({out})[0] / PyArray_ITEMSIZE({out});
+                npy_intp y_jump = PyArray_STRIDES({y})[0] / PyArray_ITEMSIZE({y});
+                npy_intp idx_jump = PyArray_STRIDES({idx})[0] / PyArray_ITEMSIZE({idx});
+
+                for(int i = 0; i < n; i++){{
+                    {idx_dtype} idx = idx_data[i * idx_jump];
+                    if ({int(idx_may_be_neg)}){{
+                        if (idx < 0) {{
+                            idx += out_shape0;
+                        }}
+                    }}
+                    if ({int(idx_may_be_invalid)}){{
+                        if ((idx < 0) || (idx >= out_shape0)) {{
+                            PyErr_Format(PyExc_IndexError,"index out of bounds");
+                            {fail}
+                        }}
+                    }}
+                    out_data[idx * out_jump] {op} y_data[i * y_jump];
+                }}
+
+            }}
+            """
+            return code
+
+        if numpy_version < "1.8.0" or using_numpy_2:
+            raise NotImplementedError
+
         return f"""
         PyObject* rval = NULL;
         if ({params}->inplace)
@@ -2546,7 +2639,8 @@ def c_code(self, node, name, input_names, output_names, sub):
         """
 
     def c_code_cache_version(self):
-        return (8,)
+        # Bumped from 8: c_code gained a specialized path for vector inputs
+        return (9,)
 
     def perform(self, node, inp, out_):
         x, y, idx = inp
diff --git a/tests/tensor/test_subtensor.py b/tests/tensor/test_subtensor.py
@@ -3028,3 +3028,29 @@ def test_advanced_subtensor1(self, static_shape, gc, benchmark):
         )
         fn.vm.allow_gc = gc
         benchmark(fn, x_values, idxs_values)
+
+    @pytest.mark.parametrize(
+        "static_shape", (False, True), ids=lambda x: f"static_shape={x}"
+    )
+    @pytest.mark.parametrize("gc", (False, True), ids=lambda x: f"gc={x}")
+    @pytest.mark.parametrize("func", (inc_subtensor, set_subtensor))
+    def test_advanced_incsubtensor1(self, func, static_shape, gc, benchmark):
+        """Benchmark two inc/set updates that share one zeros buffer."""
+        size, repeats = 85, 11
+        x = vector("x", shape=(size if static_shape else None,))
+        zeros = ptb.zeros_like(x)
+        increments = np.random.normal(size=(size * repeats,))
+        forward_idxs = np.arange(size).repeat(repeats)
+
+        # Constant indices plus a static shape mean every index is known valid.
+        # Both updates read the same zeros buffer, to check we rather allocate
+        # twice than copy inside IncSubtensor.
+        outs = [
+            pytensor.Out(func(zeros[idxs], increments), borrow=True)
+            for idxs in (forward_idxs, forward_idxs[::-1])
+        ]
+        fn = pytensor.function(
+            [x], outs, on_unused_input="ignore", trust_input=True
+        )
+        fn.vm.allow_gc = gc
+        benchmark(fn, np.zeros((size,)))