
Commit 2172bb4

[mxfp] fix bf16 x mxfp4 bug with SUBTILE_FACTOR > 1 (#8478)
```
pytest -rxs --pdb python/triton_kernels/tests/test_matmul.py::test_op
```

Before this PR:

```
E       triton.compiler.errors.CompilationError: at 337:23:
E       if is_out_microscaled:
E           MX_SCALE_BLOCK_N: tl.constexpr = OUT_BLOCK_N // MXFP_BLOCK_SIZE
E           N_MX_BLOCK: tl.constexpr = tl.cdiv(N, MXFP_BLOCK_SIZE)
E
E       for a_i in tl.static_range(len(accs)):
E           acc_tile = accs[a_i]
E           acc_tile *= x_scale * w_scale
E
E           if SWAP_XW:
E               acc_tile = acc_tile.T
E
E           acc_tile = acc_tile + biases[a_i][None, :] * betas[:, None]
E                      ^
E       ValueError('Cannot make_shape_compatible: incompatible dimensions at index 0: 64 and 16')
```

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent a4ab31d commit 2172bb4
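For context on the error above: in the epilogue, each accumulator subtile is (optionally) transposed back and must end up (BLOCK_M, OUT_BLOCK_N)-shaped before it is added to `biases[a_i][None, :] * betas[:, None]`. A rough NumPy analogue of the failing broadcast (sizes are illustrative, chosen only to mirror the 64-vs-16 message; this is not kernel code):

```python
import numpy as np

BLOCK_M, OUT_BLOCK_N = 16, 64          # illustrative tile sizes
betas = np.ones(BLOCK_M)
bias = np.zeros(OUT_BLOCK_N)

ok_tile = np.zeros((BLOCK_M, OUT_BLOCK_N))
print((ok_tile + bias[None, :] * betas[:, None]).shape)   # (16, 64): broadcast works

# With SWAP_XW, the old subtile split left the tile with its axes swapped,
# so the same broadcast fails, matching the compiler error above.
bad_tile = np.zeros((OUT_BLOCK_N, BLOCK_M))
# bad_tile + bias[None, :] * betas[:, None]   # ValueError: (64, 16) vs (16, 64) at dim 0
```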

File tree: 2 files changed, +15 -4 lines changed

python/triton_kernels/tests/test_matmul.py

Lines changed: 2 additions & 1 deletion
@@ -235,6 +235,7 @@ class Case:
         # mx types:
         Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1),
         Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1, hbm_swizzling=True),
+        Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1, hbm_swizzling=True, epilogue_subtile=4),
         Case(16, 256, 256, "ragged", "bfloat16", "mxfloat4_e2m1", 1, 1),
         Case(16, 256, 256, "ragged", "bfloat16", "mxfloat4_e2m1", 1, 1, hbm_swizzling=True),
         Case(1000, 700, 700, "batched", "bfloat16", "mxfloat4_e2m1", 8, 2),
@@ -321,7 +322,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
     if is_cuda():
         if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
             pytest.skip("Float8 not tested on A100")
-        if "float16" in act_dtype_str and "mx" in weight_dtype_str and torch.cuda.get_device_capability()[0] >= 10:
+        if act_dtype_str == "float16" and "mx" in weight_dtype_str and torch.cuda.get_device_capability()[0] >= 10:
             pytest.skip("float16 x mx not supported with cuda capability >= 10")
         if weight_dtype_str.startswith("mx"):
             if "float8" in act_dtype_str and torch.cuda.get_device_capability()[0] < 10:

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 13 additions & 3 deletions
@@ -403,14 +403,24 @@ def _p_matmul_ogs(
         biases = (bias,)

         if SUBTILE_FACTOR >= 2:
-            acc0, acc1 = acc.reshape(BLOCK_M, 2, BLOCK_N // 2).permute(0, 2, 1).split()
+            if SWAP_XW:
+                acc = acc.reshape(2, BLOCK_N // 2, BLOCK_M).permute(1, 2, 0)
+            else:
+                acc = acc.reshape(BLOCK_M, 2, BLOCK_N // 2).permute(0, 2, 1)
+            acc0, acc1 = acc.split()
             accs = (acc0, acc1)
             bias0, bias1 = bias.reshape(2, BLOCK_N // 2).permute(1, 0).split()
             biases = (bias0, bias1)

         if SUBTILE_FACTOR >= 4:
-            acc00, acc01 = acc0.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split()
-            acc10, acc11 = acc1.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split()
+            if SWAP_XW:
+                acc0 = acc0.reshape(2, BLOCK_N // 4, BLOCK_M).permute(1, 2, 0)
+                acc1 = acc1.reshape(2, BLOCK_N // 4, BLOCK_M).permute(1, 2, 0)
+            else:
+                acc0 = acc0.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1)
+                acc1 = acc1.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1)
+            acc00, acc01 = acc0.split()
+            acc10, acc11 = acc1.split()
             accs = (acc00, acc01, acc10, acc11)
             bias00, bias01 = bias0.reshape(2, BLOCK_N // 4).permute(1, 0).split()
             bias10, bias11 = bias1.reshape(2, BLOCK_N // 4).permute(1, 0).split()
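For intuition on the kernel change: the reshape/permute above simply regroups the accumulator along its N axis before splitting it in half, and under SWAP_XW the tile is stored transposed as (BLOCK_N, BLOCK_M), so N is the leading axis. A minimal NumPy stand-in for the tile bookkeeping (illustrative shapes; NumPy's `split` keeps a size-1 trailing axis where Triton's `split` drops it):

```python
import numpy as np

BLOCK_M, BLOCK_N = 16, 128   # illustrative tile sizes

# Non-swapped layout: tile is (BLOCK_M, BLOCK_N); regroup and split along N.
acc = np.arange(BLOCK_M * BLOCK_N).reshape(BLOCK_M, BLOCK_N)
acc0, acc1 = np.split(acc.reshape(BLOCK_M, 2, BLOCK_N // 2).transpose(0, 2, 1), 2, axis=-1)
assert acc0[..., 0].shape == (BLOCK_M, BLOCK_N // 2)

# SWAP_XW layout: tile is stored transposed as (BLOCK_N, BLOCK_M), so N must be
# regrouped on the leading axis before the split, as the fix does.
acc_t = acc.T
acc0_t, acc1_t = np.split(acc_t.reshape(2, BLOCK_N // 2, BLOCK_M).transpose(1, 2, 0), 2, axis=-1)
assert acc0_t[..., 0].shape == (BLOCK_N // 2, BLOCK_M)

# Both paths select the same half of the tile, just transposed.
assert np.array_equal(acc0_t[..., 0].T, acc0[..., 0])
```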
