fix swizzle kernel w/ fullgraph (#2705)

drisspg · web-flow · commit 9f12f146f8ec · 2025-08-06T19:37:04.000-07:00
stack-info: PR: #2705, branch: drisspg/stack/86
diff --git a/torchao/prototype/mx_formats/kernels.py b/torchao/prototype/mx_formats/kernels.py
@@ -1448,6 +1448,7 @@ def triton_scale_swizzle(
             scales_flat,
         )
 
+    @torch.library.custom_op("torchao::triton_mx_block_rearrange", mutates_args=())
     def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
         """
         Rearranges an E8M0 tensor scale from row-major format to block-scaled swizzle format.
@@ -1716,6 +1717,15 @@ def _(x, per_tensor_scale=None):
         xq = torch.empty(M, N // 2, device=x.device, dtype=torch.uint8)
         return scales, xq
 
+    @triton_mx_block_rearrange.register_fake
+    def _(scale_tensor):
+        rows, cols = scale_tensor.shape
+        n_row_blocks = triton.cdiv(rows, 128)
+        n_col_blocks = triton.cdiv(cols, 4)
+        padded_rows = n_row_blocks * 128
+        padded_cols = n_col_blocks * 4
+
+        return scale_tensor.new_empty((padded_rows, padded_cols))
 else:
 
     def triton_to_mxfp8_dim1(
diff --git a/torchao/prototype/mx_formats/utils.py b/torchao/prototype/mx_formats/utils.py
@@ -15,7 +15,7 @@ def ceil_div(a, b):
     return (a + b - 1) // b
 
 
-def to_blocked(input_matrix, use_triton_kernel: bool = True) -> Tensor:
+def to_blocked(input_matrix, use_triton_kernel: bool = False) -> Tensor:
     """
     Rearrange a large matrix by breaking it into blocks and applying the rearrangement pattern.