
Commit b010cf1

[KERNELS] fix mxfp4 constraints and split k in persistent matmul (#7119)
- Strides need to be a multiple of 32 bytes for the fp4 TMA.
- The inner dim needs to be a multiple of 128 for the fp4 TMA; the previous code only checked `w.shape[-1]`, but this is the wrong axis when SWAP_XW mode is used.
- Split-k offsets and masks were computed incorrectly.
- Update test shapes so they actually exercise the persistent TMA codepath.
1 parent 8a6dfa5 commit b010cf1
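For context on the numbers above: mxfp4 (e2m1) weights are stored two values per uint8 byte, so the 32-byte stride and 128-element inner-dim requirements translate into small packed sizes. A back-of-the-envelope check (plain Python, illustrative only, not code from this commit) of why the test K dimension plausibly moves from 800 to 832:

```python
# mxfp4 packs two fp4 (e2m1) values per uint8 byte.
FP4_PER_BYTE = 2

# The 32-byte TMA stride requirement expressed in fp4 elements.
print(32 * FP4_PER_BYTE)  # 64

# Packed K size in bytes for the old and new test shapes:
# 800 fp4 values -> 400 bytes (400 % 32 == 16, not aligned),
# 832 fp4 values -> 416 bytes (416 % 32 == 0, aligned).
print(800 // FP4_PER_BYTE % 32, 832 // FP4_PER_BYTE % 32)  # 16 0
```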

File tree: 3 files changed (+35, -24 lines)


python/triton_kernels/tests/test_matmul.py

Lines changed: 9 additions & 8 deletions
@@ -193,16 +193,17 @@ class Case:
     Case(300, 400, 400, "batched", "bfloat16", "mxfloat8_e5m2", 32, 4),
     Case(1000, 700, 2, "batched", "bfloat16", "mxfloat4_e2m1", 8, 2),
     Case(16, 256, 256, "ragged", "float8_e5m2", "mxfloat4_e2m1", 128, 4, hbm_swizzling=True),
-    Case(1000, 704, 800, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, hbm_swizzling=True),
-    Case(1000, 704, 800, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9, hbm_swizzling=True),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, hbm_swizzling=True),
+    Case(1000, 704, 832, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, hbm_swizzling=True),
+    Case(1000, 704, 832, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, hbm_swizzling=True),
+    Case(1000, 704, 832, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1),
+    Case(1000, 704, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9),
+    Case(1000, 704, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9, hbm_swizzling=True),
+    Case(1000, 704, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2),
+    Case(1000, 704, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, hbm_swizzling=True),
     Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn", 8, 4),
     Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn", 8, 4, hbm_swizzling=True),
-    Case(300, 400, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4),
-    Case(300, 400, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4, hbm_swizzling=True),
+    Case(300, 400, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4),
+    Case(300, 400, 832, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4, hbm_swizzling=True),
     Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn", 32, 4),
     Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn", 32, 4, hbm_swizzling=True),
     # AMD

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 4 additions & 6 deletions
@@ -250,6 +250,8 @@ def mx_can_use_tma(mx_ctx: MicroscalingCtx):

 def can_use_persistent_tma(x, w, gather_indx, precision_config):
     mx_ctx = precision_config.mx_ctx
+    is_mxfp4 = mx_ctx.weight_scale is not None and w.dtype == torch.uint8
+    weight_stride_req = 32 if is_mxfp4 else 16
     return (
         # TMA requires CUDA 9.0, last dim contiguous, and multiple of 16-byte strides otherwise.
         target_info.cuda_capability_geq(9, 0) and
@@ -258,14 +260,10 @@ def can_use_persistent_tma(x, w, gather_indx, precision_config):
            x.stride(1) * x.element_size() % 16 == 0 and x.stride(2) == 1
        ) and (
            # Check W is either transposed or non-transposed, and with required stride.
-           (w.stride(1) * w.element_size() % 16 == 0 and w.stride(2) == 1) or
-           (w.stride(2) * w.element_size() % 16 == 0 and w.stride(1) == 1)
+           (w.stride(1) * w.element_size() % weight_stride_req == 0 and w.stride(2) == 1) or
+           (w.stride(2) * w.element_size() % weight_stride_req == 0 and w.stride(1) == 1)
        ) and (
            mx_ctx.weight_scale is None or mx_can_use_tma(mx_ctx)
-       ) and (
-           # MFXP4 tma requires 128 elements on the inner dim.
-           # MFXP4 is represented as packed uint8.
-           w.dtype != torch.uint8 or w.shape[-1] % 128 == 0
        )
        # compiler crash ?
        and (x.dtype.itemsize <= 1 or w.dtype != torch.uint8)
python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 22 additions & 10 deletions
@@ -35,20 +35,24 @@ def _update_tensor_desc(desc, ptr, shape=None):
     )

 @triton.jit
-def _make_tensor_desc(ptr, shape, strides, block_shape, transpose: tl.constexpr = False):
+def _multiple_of(a, b):
+    return tl.cdiv(a, b) * b
+
+@triton.jit
+def _make_tensor_desc(ptr, shape, strides, block_shape, transpose: tl.constexpr = False, pad_inner_shape: tl.constexpr = 1):
     tl.static_assert(len(shape) == len(strides))
     tl.static_assert(len(strides) == len(block_shape))
     if transpose:
         return tl.make_tensor_descriptor(
             ptr,
-            shape=shape[:-2] + [shape[-1], shape[-2]],
+            shape=shape[:-2] + [shape[-1], _multiple_of(shape[-2], pad_inner_shape)],
             strides=strides[:-2] + [strides[-1], tl.constexpr(1)],
             block_shape=block_shape[:-2] + [block_shape[-1], block_shape[-2]],
         )
     else:
         return tl.make_tensor_descriptor(
             ptr,
-            shape=shape,
+            shape=shape[:-1] + [_multiple_of(shape[-1], pad_inner_shape)],
             strides=strides[:-1] + [tl.constexpr(1)],
             block_shape=block_shape,
         )
@@ -235,12 +239,20 @@ def _p_matmul_ogs(
         block_shape=[BLOCK_M, BLOCK_K]
     )

+    # Pad the inner shape to 128 for mxfp4 weights; TMA requires this when the compiler uses CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B.
+    # This technically makes the shape masking incorrect, but it's fine because:
+    # - When the N dim is padded, the scales will be masked to 0.
+    # - When the K dim is padded, the activations we perform tl.dot with will be masked to 0.
+    #   Note: the scales can't be relied on for zeroing in this case, because they apply to groups
+    #   of 32 elements in the K dimension.
+    w_pad_inner_shape = 128 if is_microscaled_format and W.dtype.element_ty == tl.uint8 else 1
     w_desc = _make_tensor_desc(W,
         shape=[N_EXPTS_TOT if ExptData is not None else batch_size,
                (K + W_PACK_DIVISOR - 1) // W_PACK_DIVISOR, N],
         strides=[stride_w_e, stride_w_k, stride_w_n],
         block_shape=[1, PACKED_BLOCK_K_W, BLOCK_N],
-        transpose=W_TRANSPOSE)
+        transpose=W_TRANSPOSE,
+        pad_inner_shape=w_pad_inner_shape)

     if is_microscaled_format:
         PackedK = (K + MX_PACK_DIVISOR - 1) // MX_PACK_DIVISOR
@@ -320,7 +332,7 @@ def _p_matmul_ogs(

     if SPLIT_K > 1:
         offs_mx_k += MX_SCALE_BLOCK_K * pid_k
-        offs_mx_inner += PACKED_MX_BLOCK * pid_k
+        offs_mx_inner += (MX_SCALE_BLOCK_K // 4) * pid_k * stride_mx_k

     if X_USE_LOAD_TMA:
         if ExptData is None:
@@ -357,13 +369,13 @@ def _p_matmul_ogs(
         else:
             XPtrs = XBase + offs_x_m + offs_x_k
             XBase += BLOCK_K * SPLIT_K * stride_x_k
+            mask_k = tl.arange(0, BLOCK_K) < K - off_k
             if EVEN_K:
                 if SPLIT_K > 1:
-                    x = tl.load(XPtrs, mask=off_k < K, other=0.0)
+                    x = tl.load(XPtrs, mask=mask_k[None, :], other=0.0)
                 else:
                     x = tl.load(XPtrs)
             else:
-                mask_k = tl.arange(0, BLOCK_K) < K - off_k
                 x = tl.load(XPtrs, mask=mask_k[None, :], other=0.0)

         w = _load_tensor_desc(w_desc, [expt_id, off_k_w, off_n], transpose=W_TRANSPOSE)
@@ -381,17 +393,17 @@ def _p_matmul_ogs(
                 w_scales = unswizzle_mx_scale_bw(tl.load(MxPtrs))
             else:
                 MxPtrs = MxScale + expt_id.to(index_type) * stride_mx_e + offs_mx_k.to(index_type)[None, :] * stride_mx_k + offs_w_n.to(index_type)[:, None] * stride_mx_n + ki * MX_SCALE_BLOCK_K * SPLIT_K * stride_mx_k
+                mask_k = offs_mx_k < tl.cdiv(K - off_k, MX_PACK_DIVISOR)
                 if EVEN_K:
                     if SPLIT_K > 1:
-                        w_scales = tl.load(MxPtrs, mask=off_k < K, other=0.0)
+                        w_scales = tl.load(MxPtrs, mask=mask_k[None, :], other=0.0)
                     else:
                         w_scales = tl.load(MxPtrs)
                 else:
-                    mask_k = offs_mx_k < tl.cdiv(K - off_k, MX_PACK_DIVISOR)
                     w_scales = tl.load(MxPtrs, mask=mask_k[None, :], other=0.0)

         elif SWIZZLE_MX_SCALE == "BLACKWELL":
-            w_scales = mx_desc.load([expt_id, off_n // 128, ki * (MX_SCALE_BLOCK_K // 4 * SPLIT_K), 0, 0])
+            w_scales = mx_desc.load([expt_id, off_n // 128, pid_k * MX_SCALE_BLOCK_K // 4 + ki * (MX_SCALE_BLOCK_K // 4 * SPLIT_K), 0, 0])
             w_scales = w_scales.reshape((w_scales.shape[1], w_scales.shape[2] * 32 * 4 * 4))
             w_scales = unswizzle_mx_scale_bw(w_scales)
         else:
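As a sanity check on the padding helper introduced above, the same round-up can be reproduced in plain Python with the shapes from the updated tests (illustrative only; mirrors `tl.cdiv(a, b) * b` from the diff):

```python
def multiple_of(a, b):
    # Plain-Python equivalent of the Triton helper above: ceil-divide, then scale back up.
    return -(-a // b) * b

# Non-transposed weights: the inner descriptor dim is N, so N = 704 pads to 768.
print(multiple_of(704, 128))       # 768

# Transposed weights: the inner dim is the packed K, so 832 // 2 = 416 pads to 512.
print(multiple_of(832 // 2, 128))  # 512
```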
