
Commit 10c36d2

Apply useless blockwise rewrite when there are only dummy batch dims
Also extend the eager rewrite to more Ops.

The Blockwise MatrixInverse grad test became more sensitive in float32, because desired stabilization rewrites (mainly `inv_as_solve`) that target the Dot of a Blockwise{MatrixInverse} are now triggered in the default Blockwise grad but not in the non-default, non-Blockwise grad.
1 parent fe5865e commit 10c36d2
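For context, a hedged sketch of the case this commit targets (the variable names and shapes below are illustrative assumptions, not taken from the diff): a Blockwise node whose batch dimensions are all broadcastable ("dummy") can be replaced by its core Op, with the dummy dimensions squeezed away first and padded back on afterwards.

# Hedged sketch only; Blockwise, MatrixInverse and shape_padleft are the objects
# referenced in the diff, but the concrete graph below is an assumption for illustration.
import pytensor.tensor as pt
from pytensor.tensor.basic import shape_padleft
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.nlinalg import MatrixInverse

x_core = pt.matrix("x")  # core (m, m) input
x = x_core[None]         # add one dummy (broadcastable, length-1) batch dimension

blockwise_inv = Blockwise(MatrixInverse(), signature="(m,m)->(m,m)")
y = blockwise_inv(x)     # Blockwise whose only batch dim is a dummy one

# After local_useless_unbatched_blockwise, the graph is equivalent to squeezing the
# dummy batch dim, applying the core Op directly, and padding the dim back on:
y_equivalent = shape_padleft(MatrixInverse()(x.squeeze(0)), 1)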

3 files changed (+43 -12 lines)

pytensor/tensor/blockwise.py

Lines changed: 5 additions & 5 deletions
@@ -163,16 +163,16 @@ def make_node(self, *inputs):
 
         return Apply(self, batched_inputs, batched_outputs)
 
-    def _batch_ndim_from_outputs(self, outputs: Sequence[TensorVariable]) -> int:
-        return cast(int, outputs[0].type.ndim - len(self.outputs_sig[0]))
+    def batch_ndim(self, node: Apply) -> int:
+        return cast(int, node.outputs[0].type.ndim - len(self.outputs_sig[0]))
 
     def infer_shape(
         self, fgraph, node, input_shapes
     ) -> list[tuple[TensorVariable, ...]]:
         from pytensor.tensor import broadcast_shape
         from pytensor.tensor.shape import Shape_i
 
-        batch_ndims = self._batch_ndim_from_outputs(node.outputs)
+        batch_ndims = self.batch_ndim(node)
         core_dims: dict[str, Any] = {}
         batch_shapes = []
         for input_shape, sig in zip(input_shapes, self.inputs_sig):
@@ -278,7 +278,7 @@ def L_op(self, inputs, outs, ograds):
             return new_rval
 
         # Sum out the broadcasted dimensions
-        batch_ndims = self._batch_ndim_from_outputs(outs)
+        batch_ndims = self.batch_ndim(outs[0].owner)
         batch_shape = outs[0].type.shape[:batch_ndims]
         for i, (inp, sig) in enumerate(zip(inputs, self.inputs_sig)):
             if isinstance(rval[i].type, (NullType, DisconnectedType)):
@@ -320,7 +320,7 @@ def core_func(*inner_inputs):
         return self._gufunc
 
     def _check_runtime_broadcast(self, node, inputs):
-        batch_ndim = self._batch_ndim_from_outputs(node.outputs)
+        batch_ndim = self.batch_ndim(node)
 
         for dims_and_bcast in zip(
             *[
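The renamed batch_ndim helper derives the number of batch dimensions from the Apply node rather than from a separate list of outputs: output ndim minus the length of the core output signature. A small hedged illustration (the core Op and input rank are assumptions chosen for the example):

# Illustrative use of the new Blockwise.batch_ndim helper; Op and rank are assumptions.
import pytensor.tensor as pt
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.nlinalg import MatrixInverse

blockwise_inv = Blockwise(MatrixInverse(), signature="(m,m)->(m,m)")
node = blockwise_inv(pt.tensor4("w")).owner  # 4d input, 2d core signature

# Output ndim (4) minus the length of the core output signature "(m,m)" (2) is 2:
assert blockwise_inv.batch_ndim(node) == 2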

pytensor/tensor/rewriting/blockwise.py

Lines changed: 37 additions & 6 deletions
@@ -2,9 +2,15 @@
 from pytensor.graph import node_rewriter
 from pytensor.graph.replace import vectorize_node
 from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
+from pytensor.tensor.basic import Alloc, ARange, shape_padleft
 from pytensor.tensor.blockwise import Blockwise
-from pytensor.tensor.math import _matrix_matrix_matmul
-from pytensor.tensor.rewriting.basic import register_canonicalize
+from pytensor.tensor.math import Dot
+from pytensor.tensor.rewriting.basic import (
+    register_canonicalize,
+    register_specialize,
+    register_stabilize,
+)
+from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor, Subtensor
 
 
 @node_rewriter([Blockwise])
@@ -29,8 +35,17 @@ def local_useless_unbatched_blockwise(fgraph, node):
     op = node.op
     inputs = node.inputs
 
-    if max(inp.type.ndim - len(sig) for inp, sig in zip(inputs, op.inputs_sig)) == 0:
-        return copy_stack_trace(node.outputs, op.core_op.make_node(*inputs).outputs)
+    batch_ndims = node.op.batch_ndim(node)
+    if all(all(inp.type.broadcastable[:batch_ndims]) for inp in inputs):
+        if batch_ndims:
+            # Remove dummy batch dims
+            axis = tuple(range(batch_ndims))
+            inputs = [inp.squeeze(axis) for inp in inputs]
+        new_outs = op.core_op.make_node(*inputs).outputs
+        if batch_ndims:
+            # Reintroduce dummy batch dims
+            new_outs = [shape_padleft(out, batch_ndims) for out in new_outs]
+        return copy_stack_trace(node.outputs, new_outs)
 
 
 # We register this rewrite late, so that other rewrites need only target Blockwise Ops
@@ -46,6 +61,22 @@ def local_useless_unbatched_blockwise(fgraph, node):
 
 # Avoid redundant cases early on for Ops whose default form is not Blockwised
 @register_canonicalize
-@node_rewriter(tracks=[_matrix_matrix_matmul])
+@register_stabilize
+@register_specialize
+@node_rewriter(tracks=[Blockwise])
 def local_eager_useless_unbatched_blockwise(fgraph, node):
-    return local_useless_unbatched_blockwise.fn(fgraph, node)
+    if isinstance(
+        node.op.core_op,
+        (
+            # Many Dot-related rewrites (e.g., all of BlasOpt) happen before specialize
+            Dot,
+            # These Ops can't always be trivially vectorized at runtime,
+            # Since their inputs may imply non-rectangular shapes.
+            Alloc,
+            ARange,
+            Subtensor,
+            AdvancedSubtensor,
+            AdvancedIncSubtensor,
+        ),
+    ):
+        return local_useless_unbatched_blockwise.fn(fgraph, node)
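A hedged sketch of why the eager variant is now registered in canonicalize, stabilize, and specialize for these core Ops (the graph below is illustrative, not from the commit): rewrites such as the BLAS optimizations only recognize the plain Dot Op, so an unbatched Blockwise{Dot} needs to be unwrapped before those passes run.

# Illustrative only: matmul of two unbatched matrices is built on a Blockwise-capable
# Op, and the eager rewrite lets later Dot/BLAS rewrites see the core Dot underneath.
import pytensor
import pytensor.tensor as pt

a = pt.matrix("a")
b = pt.matrix("b")
out = pt.matmul(a, b)  # no real batch dimensions here

f = pytensor.function([a, b], out)
pytensor.dprint(f)  # the compiled graph contains the core dot / BLAS Ops, not a Blockwise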

tests/tensor/test_blockwise.py

Lines changed: 1 addition & 1 deletion
@@ -293,7 +293,7 @@ def test_grad(self):
             pt_out,
             np_out,
             rtol=1e-7 if config.floatX == "float64" else 1e-5,
-            atol=1e-6 if config.floatX == "float64" else 1e-5,
+            atol=1e-6 if config.floatX == "float64" else 1e-4,
         )
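For context on the looser float32 atol, a hedged sketch of the pattern the commit message refers to (the graph itself is an assumption for illustration): the Blockwise gradient of MatrixInverse produces a dot with the inverse, which the `inv_as_solve` stabilization rewrite turns into a solve, so the float32 result can differ slightly from a reference computed without that rewrite.

# Illustrative pattern targeted by inv_as_solve; the variables are assumptions.
import pytensor.tensor as pt
from pytensor.tensor.nlinalg import matrix_inverse

A = pt.matrix("A")
b = pt.matrix("b")
graph = pt.dot(matrix_inverse(A), b)
# During stabilization, inv_as_solve rewrites dot(inv(A), b) into solve(A, b), which
# avoids explicitly forming the inverse and changes float32 round-off slightly.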
