@@ -194,16 +194,15 @@ def local_lift_transpose_through_dot(fgraph, node):
194194 return ret
195195
196196
197- @register_canonicalize
198- @register_specialize
199- @node_rewriter (tracks = [_matmul ])
200- def local_batched_matmul_to_core_matmul (fgraph , node ):
197+ def _batched_matmul_to_core_matmul (fgraph , node , allow_reshape : bool ):
201198 """Move batch dimensions of matmul operands to core matmul
202199
203200 Example, if x has batch dimensions that don't overlap with batch dimensions of y
204201 x @ y -> (x.reshape(-1, x.shape[-1]) @ y).reshape(*x.shape[:-1], y.shape[-1])
205202
206203 It also works for batch dimensions of y that don't overlap with batch dimensions of x
204+
205+ The rewrite only uses reshape when mixing dimensions, and it can refuse to apply if `allow_reshape=False`
207206 """
208207
209208 x , y = node .inputs
@@ -247,6 +246,9 @@ def local_batched_matmul_to_core_matmul(fgraph, node):
247246 # x was a row matrix, squeeze it to clean up the graph
248247 x_stacked = x_stacked .squeeze (- 2 )
249248 if n_x_axis_to_merge > 1 or not x_is_row :
249+ if not allow_reshape :
250+ return None
251+
250252 # Ravel moved batch dims together with (m) if needed
251253 x_stacked_shape = tuple (x_stacked .shape )
252254 x_stacked = x_stacked .reshape (
@@ -262,6 +264,8 @@ def local_batched_matmul_to_core_matmul(fgraph, node):
262264 # y was a column matrix, squeeze it to clean up the graph
263265 y_stacked = y_stacked .squeeze (- 1 )
264266 if n_y_axis_to_merge > 1 or not y_is_col :
267+ if not allow_reshape :
268+ return False
265269 # Ravel moved batch dims together with (n) if needed
266270 y_stacked_shape = tuple (y_stacked .shape )
267271 y_stacked = y_stacked .reshape (
@@ -319,6 +323,21 @@ def local_batched_matmul_to_core_matmul(fgraph, node):
319323 return [out ]
320324
321325
@register_canonicalize
@node_rewriter(tracks=[_matmul])
def local_batched_matmul_to_core_matmul(fgraph, node):
    """Pass batch dimensions of a matmul to the core matmul, without reshapes.

    Delegates to ``_batched_matmul_to_core_matmul`` with ``allow_reshape=False``,
    so only the cases involving core vector / column matrices are handled here.
    """
    return _batched_matmul_to_core_matmul(fgraph, node, allow_reshape=False)
331+
332+
@register_specialize
@node_rewriter(tracks=[_matmul])
def local_batched_matmul_to_core_matmul_with_reshape(fgraph, node):
    """Move batch dimensions of matmul operands to the core matmul, using reshape.

    Delegates to ``_batched_matmul_to_core_matmul`` with ``allow_reshape=True``,
    allowing batch dimensions to be stacked with core dimensions via a reshape
    operation. Registered only in the specialize phase, because graphs with
    reshape are hard to work with in earlier rewrite phases.
    """
    return _batched_matmul_to_core_matmul(fgraph, node, allow_reshape=True)
339+
340+
322341@register_canonicalize
323342@register_specialize
324343@node_rewriter ([_matmul ])
0 commit comments