@@ -427,6 +427,7 @@ def matmul_ogs(x, w, bias,
     if not isinstance(x, Tensor):
         x = Tensor(x, dtype=x.dtype)
     # determine shapes
+    is_ragged = routing_data.expt_hist is not None
     M = x.shape[-2] if gather_indx is None else gather_indx.src_indx.shape[0]
     batch_size = w.shape[0] if routing_data.expt_hist is None and w.ndim == 3 else 1
     K, N = w.shape[-2:]
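Note: `is_ragged` records whether routing produced a per-expert histogram, i.e. whether the rows of `x` are grouped into variable-length per-expert segments rather than a fixed dense batch. A minimal sketch of how such a ragged layout follows from the histogram (the `ragged_offsets` helper is hypothetical, not part of this change):

```python
import torch

def ragged_offsets(expt_hist: torch.Tensor) -> torch.Tensor:
    """Per-expert start offsets for a ragged [sum(hist), K] activation tensor.

    expt_hist[i] is the number of tokens routed to expert i; a ragged matmul
    treats rows [offs[i], offs[i + 1]) as expert i's contiguous slab.
    """
    offs = torch.zeros(expt_hist.numel() + 1, dtype=torch.int64)
    offs[1:] = torch.cumsum(expt_hist, dim=0)
    return offs

# e.g. expt_hist = [3, 0, 5] -> offsets [0, 3, 3, 8]; expert 1 owns no rows
print(ragged_offsets(torch.tensor([3, 0, 5])))
```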
@@ -505,19 +506,31 @@ def matmul_ogs(x, w, bias,
     grid = min(target_info.num_sms() - opt_flags.idle_sms, max_grid) if opt_flags.is_persistent else max_grid
     # canonicalize storage
     has_gather = gather_indx is not None
-    x_storage = _canonicalize_storage(x.storage, 2 if has_gather else 3, flex.lhs_data)
+    has_scatter = writeback_idxs is not None
+    has_gather_tma = has_gather and target_info.has_tma_gather()
+    has_scatter_tma = has_scatter and target_info.has_tma_gather()
+    y = wrap_torch_tensor(out0.view(-1, out0.shape[-1]) if has_scatter else out0.view(-1, *out0.shape[-2:]))
+    x_storage = _canonicalize_storage(x.storage, 2 if has_gather_tma else 3, flex.lhs_data)
     w_storage = _canonicalize_storage(w.storage, 3, flex.rhs_data)
+    y_storage = _canonicalize_storage(y.storage, 2 if has_scatter_tma else 3, flex.out_data)
     # create tma descriptor for x
-    x_has_tma = ((not has_gather) or (has_gather and target_info.has_tma_gather())) and opt_flags.is_persistent
-    x_block_tma = ([1] if has_gather else [1, opt_flags.block_m]) + [opt_flags.block_k]
-    x_tensor_or_tma = x_storage.make_tma(x_block_tma) if x_has_tma else x_storage.data
+    x_has_tma = opt_flags.is_persistent and (has_gather_tma or not has_gather)
+    x_tma_block_size = [1, opt_flags.block_k] if has_gather_tma else [1, opt_flags.block_m, opt_flags.block_k]
+    x_tma_mode = None if not x_has_tma else "ragged" if is_ragged and not has_gather_tma else "dense"
+    x_tensor_or_tma = x_storage.make_tma(x_tma_block_size, x_tma_mode) if x_has_tma else x_storage.data
+    # create tma descriptor for y
+    y_has_tma = opt_flags.is_persistent and (has_scatter_tma or not has_scatter)
+    block_n = opt_flags.block_n // opt_flags.epilogue_subtile // fused_activation.reduction_n
+    y_tma_block_size = [1, block_n] if has_scatter_tma else [1, opt_flags.block_m, block_n]
+    y_tma_mode = None if not y_has_tma else "ragged" if is_ragged and not has_scatter_tma else "dense"
+    y_tensor_or_tma = y_storage.make_tma(y_tma_block_size, y_tma_mode) if y_has_tma else y_storage.data
     # create tma descriptor for w
     w_has_tma = opt_flags.is_persistent
-    w_tensor_or_tma = w_storage.make_tma([1, opt_flags.block_k, opt_flags.block_n]) if w_has_tma else w_storage.data
+    w_tensor_or_tma = w_storage.make_tma([1, opt_flags.block_k, opt_flags.block_n], "dense") if w_has_tma else w_storage.data
     # create tma descriptor for w_scale
     w_scale_tensor_or_tma = w_scale
     w_scale_has_tma = opt_flags.is_persistent and w_scale is not None
-    w_scale_tensor_or_tma = w_scale.storage.make_tma([opt_flags.block_n, opt_flags.block_k]) if w_scale_has_tma else w_scale
+    w_scale_tensor_or_tma = w_scale.storage.make_tma([opt_flags.block_n, opt_flags.block_k], "dense") if w_scale_has_tma else w_scale
     # canonicalize strides
     x_strides = [0]*(3 - x_storage.data.ndim) + list(x_storage.data.stride())
     x_scale_strides = x_scale.stride() if x_has_mx else (None, None, None)
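The x and y descriptors now each pick one of three modes: no TMA (raw pointer fallback), "dense" (gather/scatter TMA, or a plain dense batch), or "ragged" (3D blocks over variable-length expert slabs). Note also that the store tile width is `block_n // epilogue_subtile // reduction_n`, e.g. 128 // 2 // 2 = 32. A sketch restating the selection logic above as one function; `tma_mode` and the `has_side*` parameters are illustrative stand-ins for the gather pair on the x side and the scatter pair on the y side:

```python
def tma_mode(is_persistent: bool, has_side_tma: bool, has_side: bool, is_ragged: bool):
    """Restates the x/y descriptor-mode selection above (illustrative only)."""
    has_tma = is_persistent and (has_side_tma or not has_side)
    if not has_tma:
        return None            # no descriptor: kernel gets a raw pointer
    if is_ragged and not has_side_tma:
        return "ragged"        # 3D block over variable-length expert slabs
    return "dense"             # gather/scatter TMA, or a plain dense batch
```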
@@ -529,14 +542,13 @@ def matmul_ogs(x, w, bias,
     # launch kernel
     kernels = get_kernels(epilogue.specs, fused_activation.specs)
     (kernels._p_matmul_ogs if opt_flags.is_persistent else kernels._matmul_ogs)[(grid,)](
-        flex.out_data.reinterpret(memory["output"]),
-        flex.out_data.reinterpret(out0), *out0.stride(),
+        y_tensor_or_tma, y_storage.data, *out0.stride(),
         *((None, out_scale, None) if out_has_mx else out0_flex),
         *out_scale_strides[-3:],
         x_tensor_or_tma, x_storage.data, *x_strides,
         flex.lhs_data.scale,
         None if x_scale is None else x_scale.data.view(torch.uint8), *x_scale_strides,
-        w_tensor_or_tma, *w_storage.data.stride(), w_storage.data.stride()[-1] != 1,
+        w_tensor_or_tma, w_storage.data, *w_storage.data.stride(), w_storage.data.stride()[-1] != 1,
         flex.rhs_data.scale,
         w_scale_tensor_or_tma, *w_scale_strides,
         bias, bias_stride,
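Each operand is now passed to the kernel as a pair: the TMA descriptor (or the raw tensor when no descriptor was built) plus the underlying storage tensor, e.g. `y_tensor_or_tma, y_storage.data` and `w_tensor_or_tma, w_storage.data`. A rough sketch of the host-side convention this establishes; `pick_source` is hypothetical and assumes the kernel falls back to plain pointer arithmetic when no mode is set:

```python
def pick_source(tensor_or_tma, raw_data, mode):
    """Illustrates the (descriptor, raw tensor) argument pair (assumption)."""
    if mode is None:
        return raw_data        # non-TMA path: ordinary loads/stores
    return tensor_or_tma       # "dense"/"ragged": use the TMA descriptor
```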
@@ -574,7 +586,8 @@ def matmul_ogs(x, w, bias,
         num_stages=opt_flags.num_stages,
         arch=opt_flags.arch,
         UPCAST_INDICES=should_upcast_indices(x, w, out0),
-        DISABLE_Y_TMA=out0.stride(-2) * out0.dtype.itemsize % 16 != 0,
+        X_TMA_MODE=x_tma_mode,
+        Y_TMA_MODE=y_tma_mode,
         SWAP_XW=preprocessing_features.swap_xw,
         IS_EPILOGUE_DEQUANT_MXFP8=epilogue.specs.name == FnName.DEQUANTIZE_MXFP8.name,
         NUM_SMS=grid if opt_flags.is_persistent else 0,
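The removed `DISABLE_Y_TMA` flag is replaced by the host-computed `X_TMA_MODE` / `Y_TMA_MODE` constexprs; presumably any alignment fallback now happens where the descriptor is built. For reference, the old flag tested whether the output row stride met TMA's 16-byte stride alignment requirement; a worked evaluation of that removed expression (shapes illustrative):

```python
import torch

# The removed DISABLE_Y_TMA check: TMA descriptors need the row stride,
# measured in bytes, to be a multiple of 16.
out0 = torch.empty(16, 100, dtype=torch.float16)   # row stride: 100 elements
disable_y_tma = out0.stride(-2) * out0.dtype.itemsize % 16 != 0
print(disable_y_tma)   # True: 100 * 2 = 200 bytes is not a multiple of 16
```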