Skip to content

Commit 74156ec

Browse files
committed
Speedup python blockwise
1 parent 60bfb3b commit 74156ec

File tree

1 file changed

+75
-38
lines changed

1 file changed

+75
-38
lines changed

pytensor/tensor/blockwise.py

Lines changed: 75 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from collections.abc import Sequence
1+
from collections.abc import Callable, Sequence
22
from typing import Any, cast
33

44
import numpy as np
5+
from numpy import broadcast_shapes, empty, ndindex, nditer
56

67
from pytensor import config
78
from pytensor.compile.builders import OpFromGraph
@@ -28,6 +29,67 @@
2829
from pytensor.tensor.variable import TensorVariable
2930

3031

32+
def _vectorize_node_perform(core_node, batch_ndim: int):
33+
core_op_perform = core_node.op.perform
34+
n_outs = len(core_node.outputs)
35+
36+
def vectorized_func(
37+
*args,
38+
core_node=core_node,
39+
core_op_perform=core_op_perform,
40+
batch_ndim=batch_ndim,
41+
n_outs=n_outs,
42+
):
43+
batch_shape = broadcast_shapes(*(arg.shape[:batch_ndim] for arg in args))
44+
args = list(args)
45+
for i, arg in enumerate(args):
46+
if arg.shape[:batch_ndim] != batch_shape:
47+
# Main logic of `np.broadcast_to`
48+
it = nditer(
49+
(arg,),
50+
flags=["multi_index", "zerosize_ok"],
51+
op_flags=["readonly"],
52+
itershape=batch_shape + arg.shape[batch_ndim:],
53+
order="C",
54+
)
55+
with it:
56+
args[i] = it.itviews[0]
57+
58+
core_output_storage = [[None] for _ in range(n_outs)]
59+
ndindex_iterator = ndindex(*batch_shape)
60+
# Call once to get the output shapes
61+
try:
62+
# TODO: Pass core shape as input like BlockwiseWithCoreShape does?
63+
index0 = next(ndindex_iterator)
64+
except StopIteration:
65+
raise NotImplementedError("vectorize with zero iterations not implemented")
66+
else:
67+
core_op_perform(
68+
core_node,
69+
[np.asarray(arg[index0]) for arg in args],
70+
core_output_storage,
71+
)
72+
outputs = tuple(
73+
empty(batch_shape + core_output[0].shape, dtype=core_output[0].dtype)
74+
for core_output in core_output_storage
75+
)
76+
for output, core_output in zip(outputs, core_output_storage): # noqa: B905
77+
output[index0] = core_output[0]
78+
79+
for index in ndindex_iterator:
80+
core_op_perform(
81+
core_node,
82+
[np.asarray(a[index]) for a in args],
83+
core_output_storage,
84+
)
85+
for output, core_output in zip(outputs, core_output_storage): # noqa: B905
86+
output[index] = core_output[0]
87+
88+
return outputs
89+
90+
return vectorized_func
91+
92+
3193
class Blockwise(Op):
3294
"""Generalizes a core `Op` to work with batched dimensions.
3395
@@ -308,46 +370,29 @@ def L_op(self, inputs, outs, ograds):
308370

309371
return rval
310372

311-
def _create_node_gufunc(self, node) -> None:
373+
def _create_node_gufunc(self, node: Apply) -> Callable:
    """Create the node gufunc used in `perform`.

    If the Blockwise or core_op has a `gufunc_spec`, the relevant numpy or
    scipy gufunc is imported and used directly. Otherwise, we build a
    vectorized loop around the core_op `perform` method for a dummy node
    (see `_vectorize_node_perform`).

    Returns the gufunc; the caller (`perform`) caches it in `node.tag`.

    Raises
    ------
    ValueError
        If a `gufunc_spec` is given but its function cannot be imported.
    """
    if (
        gufunc_spec := self.gufunc_spec
        or getattr(self.core_op, "gufunc_spec", None)
    ) is not None:
        gufunc = import_func_from_string(gufunc_spec[0])
        if gufunc is None:
            raise ValueError(f"Could not import gufunc {gufunc_spec[0]} for {self}")
    else:
        core_node = self._create_dummy_core_node(node.inputs)
        gufunc = _vectorize_node_perform(
            core_node,
            batch_ndim=self.batch_ndim(node),
        )

    return gufunc
351396

352397
def _check_runtime_broadcast(self, node, inputs):
353398
batch_ndim = self.batch_ndim(node)
@@ -375,23 +420,15 @@ def perform(self, node, inputs, output_storage):
375420
gufunc = getattr(node.tag, "gufunc", None)
376421

377422
if gufunc is None:
378-
# Cache it once per node
379-
self._create_node_gufunc(node)
380-
gufunc = node.tag.gufunc
423+
gufunc = node.tag.gufunc = self._create_node_gufunc(node)
381424

382425
self._check_runtime_broadcast(node, inputs)
383426

384427
res = gufunc(*inputs)
385428
if not isinstance(res, tuple):
386429
res = (res,)
387430

388-
# strict=False because we are in a hot loop
389-
for node_out, out_storage, r in zip(
390-
node.outputs, output_storage, res, strict=False
391-
):
392-
out_dtype = getattr(node_out, "dtype", None)
393-
if out_dtype and out_dtype != r.dtype:
394-
r = np.asarray(r, dtype=out_dtype)
431+
for node_out, out_storage, r in zip(node.outputs, output_storage, res): # noqa: B905
395432
out_storage[0] = r
396433

397434
def __str__(self):

0 commit comments

Comments
 (0)