Commit aff4b7a

yongjik and ThomasRaoux authored

[KERNEL] Fix _p_matmul_ogs when x is transposed. (#8156)

Also added checks to disable the persistent kernel when y is transposed.

Co-authored-by: Thomas Raoux <[email protected]>
1 parent e5e3dc0 commit aff4b7a

File tree: 6 files changed, +59 -12 lines changed
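For context on the terminology used throughout the diff: "x is transposed" means the activation tensor is column major, i.e. its innermost dimension is not contiguous. A minimal sketch of how such a tensor arises and how it is detected from strides, assuming nothing beyond stock PyTorch (the asserts mirror the `x_transpose = x.stride(-1) != 1` check added below):

    import torch

    # A row-major tensor has stride 1 along its last dimension; transposing a
    # contiguous buffer yields a view whose last-dim stride is the leading dim.
    x_row_major = torch.randn(64, 32)                    # stride = (32, 1)
    x_col_major = torch.randn(32, 64).transpose(-1, -2)  # shape (64, 32), stride = (1, 64)

    assert x_row_major.stride(-1) == 1      # "not transposed"
    assert x_col_major.stride(-1) != 1      # "transposed" in this commit's sense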

python/triton_kernels/tests/test_matmul.py
Lines changed: 29 additions & 2 deletions

@@ -159,6 +159,9 @@ class Case:
     split_k: int = 1
     hbm_swizzling: bool = False
     epilogue_subtile: Union[int, None] = None
+    x_transpose: bool = False
+    w_transpose: bool = False
+    y_transpose: bool = False


 @pytest.mark.parametrize(
@@ -252,6 +255,13 @@ class Case:
         Case(1000, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 3, 1),
         Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2),
         Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2, n_expt_shards=2),
+    ] + [
+        Case(320, 400, 400, mode, dtype, dtype, x_transpose=x_transpose, w_transpose=w_transpose, y_transpose=y_transpose)
+        for mode in ("batched", "ragged")
+        for dtype in ("float16", "float8_e5m2")
+        for x_transpose in (False, True)
+        for w_transpose in (False, True)
+        for y_transpose in (False, True)
     ]
     ],
 )
@@ -268,6 +278,7 @@ class Case:
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas, is_persistent, n_expts_tot,
             n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
+            x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope, fresh_knobs):
     # TODO: remove when Triton FP8 supports proper RTNE
     if is_cuda():
@@ -372,6 +383,17 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
                        has_y_gammas, requires_grad=test_bwd, device=device)
     x_ref, w_ref, bias_ref, gs0_ref, gs1_ref = apply_precision(x_tri, w_tri, bias_tri, gs0_tri, gs1_tri, precision_opt)

+    if x_transpose:
+        x_tri = x_tri.detach().transpose(-1, -2).contiguous().transpose(-1, -2).requires_grad_(test_bwd)
+    if w_transpose:
+        w_tri = w_tri.detach().transpose(-1, -2).contiguous().transpose(-1, -2).requires_grad_(test_bwd)
+    if y_transpose:
+        n_rows = m if gindx is None else gindx.dst_indx.shape[0]
+        yT_shape = (n_expts_tot, n, n_rows) if mode == "batched" else (n, n_rows)
+        y_tri_in = torch.empty(yT_shape, dtype=act_dtype, device=device).transpose(-1, -2)
+    else:
+        y_tri_in = None
+
     if w_tri.shape[0] == 1 and mode != "batched":
         # Test the case when weight has dim 2, i.e., shape (K, N).
         w_tri = w_tri.squeeze(0).detach().requires_grad_(test_bwd)
@@ -422,9 +444,14 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas

     # triton
     try:
-        tri_y = matmul_ogs(x_tri, w_tri, bias_tri, rdata, gindx, sindx, precision_opt, gammas=gs1_ref, epilogue=epilogue)
+        tri_y = matmul_ogs(x_tri, w_tri, bias_tri, rdata, gindx, sindx, precision_opt,
+                           gammas=gs1_ref, epilogue=epilogue, y=y_tri_in)
     except (opt_flags.InapplicableConstraint, NotImplementedError):
         pytest.skip("inapplicable opt_flags constraint")
+    if y_tri_in is not None:
+        assert tri_y.data_ptr() == y_tri_in.data_ptr()
+        assert tri_y.shape == y_tri_in.shape
+        assert tri_y.stride() == y_tri_in.stride()
     # If split_k > 1, then the intermediate tensor is fp32.
     sep_gather = mode == "ragged" and do_gather and n_expts_act > 1 and split_k == 1
     sep_scatter = mode == "ragged" and do_scatter and n_expts_act > 1 and split_k == 1
@@ -534,7 +561,7 @@ def test_set_idle_sms():
     num_idle_sms = 24
     matmul_ogs_set_idle_sms(num_idle_sms)
     flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(), \
-                           1, 1024, 1024, 1024, None, True, False, 1)
+                           1, 1024, 1024, 1024, None, True, False, 1, False)
     assert flags.idle_sms == num_idle_sms
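The transpose/contiguous/transpose chain used by the test is the standard trick for building a column-major operand that keeps its logical shape. A small sketch of what it does, with assumed shapes purely for illustration:

    import torch

    x = torch.randn(320, 400)
    x_t = x.detach().transpose(-1, -2).contiguous().transpose(-1, -2)

    assert torch.equal(x, x_t)        # same values, same logical shape
    assert x.stride() == (400, 1)     # row major
    assert x_t.stride() == (1, 320)   # column major: only the strides differ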

python/triton_kernels/triton_kernels/matmul_ogs.py
Lines changed: 14 additions & 5 deletions

@@ -177,6 +177,8 @@ def apply_allocation(allocation: MatmulAllocation, output):
     if output is None:
         output = torch.empty(allocation.output[0], device=allocation.device, dtype=allocation.output[1])
     else:
+        if output.ndim == 2:
+            output = output[None, :, :]
         assert output.shape == allocation.output[0]
     ret["output"] = output[None, :, :]
     ret["scratchpad"] = {
@@ -350,6 +352,7 @@ def matmul_ogs(x, w, bias,
         x_scale = Tensor(x_scale)
     if not isinstance(x, Tensor):
         x = Tensor(x, dtype=x.dtype)
+    x_transpose = x.stride(-1) != 1
     # determine shapes
     has_gather = gather_indx is not None
     has_scatter = scatter_indx is not None
@@ -362,14 +365,20 @@ def matmul_ogs(x, w, bias,
         assert x.shape[0] == w.shape[0]
     # compute optimization flags
     out_dtype = precision_config.out_dtype or x.dtype
-    can_use_tma = x.numel() > 0 and x.storage.is_tma_compliant() and \
-                  w.numel() > 0 and w.storage.is_tma_compliant() and \
-                  (w_scale is None or w_scale.storage.is_tma_compliant())
+    can_use_tma = (
+        x.numel() > 0 and x.storage.is_tma_compliant() and
+        w.numel() > 0 and w.storage.is_tma_compliant() and
+        (w_scale is None or w_scale.storage.is_tma_compliant()) and
+        (not is_ragged or x.stride(-1) == 1) and
+        # Currently we don't support tma if y is column major; may revisit later if this becomes an issue.
+        (y is None or y.stride(-1) == 1)
+    )
     # hopper w/ mxfp4 doesn't support TMA
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
     can_use_fused_scatter = has_scatter and (fused_activation.specs.fn is None) and (epilogue.specs.fn is None) and (routing_data.n_expts_act == 1)
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
-                               batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
+                               batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter,
+                               epilogue.effective_itemsize, x_transpose,
     )
     if not can_use_fused_scatter and opt_flags.fused_scatter:
         raise InapplicableConstraint("Fused scatter is not supported")
@@ -469,7 +478,7 @@ def matmul_ogs(x, w, bias,
         y_tensor_or_tma, y_storage.data, *out_matmul.stride(),
         *((None, out_matmul_scale, None) if out_matmul_has_mx else out_matmul_flex),
         *out_matmul_scale_strides[-4:],
-        x_tensor_or_tma, x_storage.data, *x_strides,
+        x_tensor_or_tma, x_storage.data, *x_strides, x_transpose,
         flex.lhs_data.scale,
         None if x_scale is None else x_scale.data.view(torch.uint8), *x_scale_strides,
         w_tensor_or_tma, w_storage.data, *w_storage.data.stride(), w_transpose,
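The widened `can_use_tma` expression adds two stride-based guards: a ragged, column-major x and a caller-provided, column-major y both fall back to the non-TMA (non-persistent) path. A simplified restatement of the y guard; the helper name is illustrative, not part of the library:

    import torch

    def y_allows_tma(y):
        # Mirrors the `(y is None or y.stride(-1) == 1)` clause above.
        return y is None or y.stride(-1) == 1

    y_row_major = torch.empty(400, 320)
    y_col_major = torch.empty(320, 400).transpose(-1, -2)  # shape (400, 320), stride (1, 400)

    assert y_allows_tma(None)
    assert y_allows_tma(y_row_major)
    assert not y_allows_tma(y_col_major)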

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py
Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ def _matmul_ogs(
     Y, YPtr, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
     YExpectedScale, YActualScale, YChecksumScale,
     stride_y_mx_k, stride_y_mx_z, stride_y_mx_m, stride_y_mx_n,
-    X, XPtr, stride_x_z, stride_x_m, stride_x_k,
+    X, XPtr, stride_x_z, stride_x_m, stride_x_k, X_TRANSPOSE: tl.constexpr,
     XScale,
     XMxScale, stride_x_mx_z, stride_x_mx_m, stride_x_mx_k,
     W, WPtr, stride_w_e, stride_w_k, stride_w_n, W_TRANSPOSE: tl.constexpr,
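`X_TRANSPOSE` is declared `tl.constexpr`, so, like the existing `W_TRANSPOSE` flag, each value compiles to its own specialized kernel instead of branching at run time. A toy Triton kernel sketching that mechanism (illustrative only, unrelated to `_matmul_ogs`; assumes a CUDA device is available):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def negate_kernel(X, Y, N, FLIP: tl.constexpr, BLOCK: tl.constexpr):
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < N
        x = tl.load(X + offs, mask=mask)
        if FLIP:  # resolved at compile time, per specialization
            x = -x
        tl.store(Y + offs, x, mask=mask)

    x = torch.randn(1024, device="cuda")
    y = torch.empty_like(x)
    negate_kernel[(triton.cdiv(x.numel(), 256),)](x, y, x.numel(), FLIP=True, BLOCK=256)
    assert torch.equal(y, -x)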

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py
Lines changed: 7 additions & 3 deletions

@@ -82,7 +82,7 @@ def _p_matmul_ogs(
     Y, YPtr, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
     YExpectedScale, YActualScale, YChecksumScale,
     stride_y_mx_k, stride_y_mx_z, stride_y_mx_m, stride_y_mx_n,
-    X, XPtr, stride_x_z, stride_x_m, stride_x_k,
+    X, XPtr, stride_x_z, stride_x_m, stride_x_k, X_TRANSPOSE: tl.constexpr,
     XScale,
     XMxScale, stride_x_mx_z, stride_x_mx_m, stride_x_mx_k,
     W, WPtr, stride_w_e, stride_w_k, stride_w_n, W_TRANSPOSE: tl.constexpr,
@@ -287,8 +287,12 @@ def _p_matmul_ogs(
         if USE_GATHER_TMA:
             x = X.gather(offs_x_m, off_k)
         elif X_TMA_MODE == "dense":
-            x = X.load([start_z, start_m + off_m, off_k])
-            x = x.reshape(BLOCK_M, BLOCK_K)
+            if X_TRANSPOSE:
+                x = X.load([start_z, off_k, start_m + off_m])
+                x = x.reshape(BLOCK_K, BLOCK_M).T
+            else:
+                x = X.load([start_z, start_m + off_m, off_k])
+                x = x.reshape(BLOCK_M, BLOCK_K)
         elif X_TMA_MODE == "ragged":
             x = load_ragged(X, start_m, eM, [start_z, off_m, off_k], ragged_dim=1)
             x = x.reshape(BLOCK_M, BLOCK_K)
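This hunk is the actual fix: when x is column major, the kernel loads a [BLOCK_K, BLOCK_M] tile with the last two coordinates swapped and then transposes it back in registers, which yields the same [BLOCK_M, BLOCK_K] tile the dense path produces. A plain-PyTorch sketch of the equivalence this relies on (illustrative shapes and offsets, not the kernel itself):

    import torch

    M, K, BLOCK_M, BLOCK_K = 16, 32, 8, 8
    off_m, off_k = 8, 16
    x = torch.arange(M * K, dtype=torch.float32).reshape(M, K)

    # Row-major path: load the [BLOCK_M, BLOCK_K] tile directly.
    tile_dense = x[off_m:off_m + BLOCK_M, off_k:off_k + BLOCK_K]
    # Transposed path: load the swapped-coordinate tile from x^T, then transpose.
    tile_via_T = x.t()[off_k:off_k + BLOCK_K, off_m:off_m + BLOCK_M].t()

    assert torch.equal(tile_dense, tile_via_T)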

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py
Lines changed: 5 additions & 1 deletion

@@ -44,6 +44,7 @@ def make_default_opt_flags_amd(
     can_use_fused_scatter,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
+    x_transpose,
     constraints,
 ):
     constraints_supported = ["block_m", "block_n", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile"]
@@ -143,6 +144,7 @@ def make_default_opt_flags_nvidia(
     can_use_fused_scatter,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
+    x_transpose,
     constraints,
 ):
     constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages", "idle_sms"]
@@ -207,6 +209,7 @@ def make_default_opt_flags_nvidia(
         out_dtype,
         lhs_dtype,
         rhs_dtype,
+        x_transpose,
     )

     if constraints.get("epilogue_subtile", None) is not None:
@@ -286,6 +289,7 @@ def make_opt_flags(
     can_use_persistent_tma,
     can_use_fused_scatter,
     epilogue_effective_itemsize,
+    x_transpose,
 ):
     if _opt_flags_constraints.get("is_persistent", False) and not can_use_persistent_tma:
         raise InapplicableConstraint("cannot enforce `is_persistent=True` constraint")
@@ -297,7 +301,7 @@ def make_opt_flags(
         return _opt_flags
     args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, batch_size, m, n, k,
             routing_data, can_use_persistent_tma, can_use_fused_scatter,
-            enforce_bitwise_invariance, epilogue_effective_itemsize,
+            enforce_bitwise_invariance, epilogue_effective_itemsize, x_transpose,
            _opt_flags_constraints]
    backend = triton.runtime.driver.active.get_current_target().backend
    if backend == "hip":
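`x_transpose` is threaded positionally through `make_opt_flags` into both backend-specific builders (and, on NVIDIA, into `compute_num_stages`), which is why the existing caller in the test gained one more trailing argument. A simplified sketch of this plumbing pattern; the names and signatures below are illustrative, not the module's real API:

    def make_flags_amd(lhs_dtype, rhs_dtype, x_transpose):
        return {"backend": "hip", "x_transpose": x_transpose}

    def make_flags_nvidia(lhs_dtype, rhs_dtype, x_transpose):
        return {"backend": "cuda", "x_transpose": x_transpose}

    def make_flags(backend, lhs_dtype, rhs_dtype, x_transpose):
        # Shared arguments, including the new flag, are packed once and forwarded
        # positionally, so every backend builder must accept the extra parameter.
        args = [lhs_dtype, rhs_dtype, x_transpose]
        builder = make_flags_amd if backend == "hip" else make_flags_nvidia
        return builder(*args)

    assert make_flags("cuda", "float16", "float16", True)["x_transpose"] is True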

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py
Lines changed: 3 additions & 0 deletions

@@ -72,6 +72,7 @@ def compute_num_stages(
     out_dtype,
     lhs_dtype,
     rhs_dtype,
+    x_transpose,
     epilogue_subtile,
     epilogue_effective_itemsize,
 ):
@@ -103,6 +104,8 @@ def compute_num_stages(
         # pipelined layout conversion before store of the accumulator
         # note: layout conversion has some padding
         smem_capacity -= int((block_m + 4) * acc_block_n * acc_size)
+    if x_transpose:
+        smem_capacity -= block_m * block_k * lhs_dtype.itemsize
     if precision_config.weight_scale is not None:
         # mx scales
         stage_size += block_n * (block_k // int(MXFP_BLOCK_SIZE))
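When x is transposed, the persistent kernel reserves additional shared memory for one BLOCK_M x BLOCK_K tile of the lhs dtype, shrinking the budget that `compute_num_stages` can spend on pipeline stages. A quick worked example with assumed tile sizes (illustrative numbers, not defaults):

    block_m, block_k = 128, 64
    lhs_itemsize = 2                               # e.g. float16
    extra_smem = block_m * block_k * lhs_itemsize  # 16384 bytes = 16 KiB
    print(f"extra smem reserved when x is transposed: {extra_smem} bytes")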
