Commit b1f774c

[FRONTEND] Support for write-only ragged TMAs (#7792)
Tested on both H100 and GB200
1 parent: 198fd9b

2 files changed: +59, -16 lines
python/test/unit/cuda/test_tma_descriptor.py
Lines changed: 3 additions & 2 deletions

@@ -55,8 +55,9 @@ def example_load_store_kernel(X, Y, x_off, y_off, x_size, y_size):
     store_ragged(Y, y_off, y_size, [0, 0], data)
 
 
+@pytest.mark.parametrize("write_only", [False, True])
 @pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
-def test_ragged_tma(dtype):
+def test_ragged_tma(dtype, write_only):
 
     if not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 9:
         pytest.skip("Test requires Hopper or Blackwell target.")
@@ -69,7 +70,7 @@ def test_ragged_tma(dtype):
     dst = 1.0 * ref
 
     X = create_ragged_descriptor(src, [32, 128])
-    Y = create_ragged_descriptor(dst, [32, 128])
+    Y = create_ragged_descriptor(dst, [32, 128], write_only=write_only)
 
     x_off = 42
     y_off = 51
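
For reference, a host-side sketch of what the new parametrization exercises (hypothetical shapes and dtype, not part of the commit; a Hopper- or Blackwell-class GPU is assumed, as in the test itself):

    import torch
    from triton.tools.ragged_tma import create_ragged_descriptor

    src = torch.randn(1024, 128, device="cuda", dtype=torch.float16)
    dst = torch.zeros_like(src)

    X = create_ragged_descriptor(src, [32, 128])                   # read-write: two extra dims prepended
    Y = create_ragged_descriptor(dst, [32, 128], write_only=True)  # write-only: one extra dim, pre-shifted base pointer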

python/triton/tools/ragged_tma.py
Lines changed: 56 additions & 14 deletions

@@ -4,7 +4,16 @@
 
 # fmt: off
 
-def create_ragged_descriptor(T, block_shape):
+class TensorDescriptorPtr:
+    def __init__(self, data_ptr, dtype):
+        self._data_ptr = data_ptr
+        self.dtype = dtype
+
+    def data_ptr(self):
+        return self._data_ptr
+
+
+def create_ragged_descriptor(T, block_shape, ragged_dim=0, write_only=False):
     """
     Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
     which behaves like a concatenation (along the first axis) of subarrays
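
The new TensorDescriptorPtr wrapper duck-types the two members that TensorDescriptor evidently needs from its first argument, data_ptr() and dtype, so that an arithmetically adjusted raw address can stand in for the tensor itself. A hypothetical illustration (the 4096-byte shift is arbitrary):

    wrapped = TensorDescriptorPtr(T.data_ptr() - 4096, T.dtype)  # shift chosen arbitrarily
    assert wrapped.data_ptr() == T.data_ptr() - 4096
    assert wrapped.dtype == T.dtype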
@@ -18,22 +27,41 @@ def create_ragged_descriptor(T, block_shape):
 
     block_shape = list(block_shape)
     tensor_shape = list(T.shape)
+    rank = len(tensor_shape)
+
+    if ragged_dim < 0:
+        ragged_dim += rank
 
-    assert 2 <= len(tensor_shape) <= 3, "ragged tensors must have dimension 2 or 3"
-    assert len(tensor_shape) == len(block_shape), "block shape must match tensor shape"
+    assert 0 <= ragged_dim < rank - 1, "last dimension cannot be ragged"
+
+    if write_only:
+        assert rank <= 4, "write-only ragged descriptors must have at most 4 dimensions"
+    else:
+        assert rank <= 3, "read-write ragged descriptors must have at most 3 dimensions"
+
+    assert len(block_shape) == rank, "block shape must have same length as tensor shape"
 
     max_int = 0x7fff0000
     billion = 0x40000000  # == 2**30
 
-    assert tensor_shape[0] <= billion, "number of rows may not exceed 2**30"
+    assert tensor_shape[ragged_dim] <= billion, "number of rows may not exceed 2**30"
+    tensor_shape[ragged_dim] = billion
+    ragged_stride = T.stride(ragged_dim)
 
     # we prepend an extra two dimensions and rely on the fact that pointers
     # have 64-bit wraparound semantics:
-    tma_stride = [2**34 - T.stride(0), T.stride(0)] + [T.stride(i) for i in range(len(tensor_shape))]
-    tma_shape = [max_int, max_int, billion] + tensor_shape[1:]
+    tma_stride = [2**34 - ragged_stride, ragged_stride] + [T.stride(i) for i in range(rank)]
+    tma_shape = [max_int, max_int] + tensor_shape
     box_shape = [1, 1] + block_shape
+    ptr = T.data_ptr()
 
-    return TensorDescriptor(T, tma_shape, tma_stride, box_shape)
+    if write_only:
+        tma_stride = tma_stride[1:]
+        tma_shape = tma_shape[1:]
+        box_shape = box_shape[1:]
+        ptr = (ptr - billion * ragged_stride * T.element_size()) % (2**64)
+
+    return TensorDescriptor(TensorDescriptorPtr(ptr, T.dtype), tma_shape, tma_stride, box_shape)
 
 
 @triton.jit
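
Why write_only can drop a dimension: the leading index of the read-write layout exists only to cancel the address shift introduced by the second dimension. Assuming, as the strides above suggest, that the leading index is always the constant 2**30, its byte offset is itself a constant and can be folded into the base pointer, which is exactly what the write_only branch does. A plain-Python sanity check of that wraparound identity (hypothetical stride and element size, not part of the commit):

    billion = 0x40000000              # 2**30
    ragged_stride = 512               # hypothetical stride along the ragged dim, in elements
    element_size = 2                  # hypothetical: float16

    # byte contribution of index 2**30 along the dropped dimension, whose stride
    # is 2**34 - ragged_stride; the 2**30 * 2**34 == 2**64 term wraps to zero:
    contribution = billion * (2**34 - ragged_stride) * element_size % 2**64
    assert contribution == (-billion * ragged_stride * element_size) % 2**64

    # hence pre-shifting the base pointer, as the write_only branch does,
    # reproduces the same 64-bit addresses with one fewer dimension:
    base = 0x7F0000000000             # hypothetical device pointer
    assert (base - billion * ragged_stride * element_size) % 2**64 == (base + contribution) % 2**64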
@@ -50,7 +78,7 @@ def to_ragged_indices(batch_offset, batch_size, row):
 
 
 @triton.jit
-def load_ragged(TMA, batch_offset, batch_size, coords):
+def load_ragged(TMA, batch_offset, batch_size, coords, ragged_dim: tl.constexpr = 0):
     """
     Read from a subarray T[batch_offset : batch_offset + batch_size] with
     hardware bounds-checking, where reading outside the subarray gives zeros.
@@ -59,14 +87,16 @@ def load_ragged(TMA, batch_offset, batch_size, coords):
     TMA.load().
     """
 
-    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[0])
-    data = TMA.load([c0, c1, c2] + coords[1:])
+    tl.static_assert(len(TMA.shape) == len(coords) + 2, "TMA must be a read-write ragged descriptor")
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+    data = TMA.load([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:])
     data = tl.reshape(data, data.shape[2:])
     return data
 
 
 @triton.jit
-def store_ragged(TMA, batch_offset, batch_size, coords, data):
+def store_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
     """
     Write to a subarray T[batch_offset : batch_offset + batch_size] with
     hardware bounds-checking, where writes outside the subarray are masked
@@ -76,6 +106,18 @@ def store_ragged(TMA, batch_offset, batch_size, coords, data):
     TMA.store().
     """
 
-    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[0])
-    data = tl.reshape(data, [1, 1] + data.shape)
-    TMA.store([c0, c1, c2] + coords[1:], data)
+    if len(TMA.shape) == len(coords) + 1:
+        write_only: tl.constexpr = True
+    elif len(TMA.shape) == len(coords) + 2:
+        write_only: tl.constexpr = False
+    else:
+        tl.static_assert(False, "TMA must be a ragged descriptor")
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+
+    if write_only:
+        data = tl.reshape(data, [1] + data.shape)
+        TMA.store([c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
+    else:
+        data = tl.reshape(data, [1, 1] + data.shape)
+        TMA.store([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
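
Device-side usage is unchanged for callers: load_ragged still requires a read-write descriptor, while store_ragged now dispatches at compile time on the descriptor's rank, so the same kernel accepts either kind of output descriptor. A minimal sketch modelled on the test's example_load_store_kernel (kernel name hypothetical; assumes 2D descriptors built as above):

    import triton
    import triton.language as tl
    from triton.tools.ragged_tma import load_ragged, store_ragged

    @triton.jit
    def copy_ragged_kernel(X, Y, x_off, y_off, x_size, y_size):
        # reads outside X[x_off : x_off + x_size] come back as zeros:
        data = load_ragged(X, x_off, x_size, [0, 0])
        # writes outside Y[y_off : y_off + y_size] are masked; Y may be
        # read-write (rank 4 here) or write-only (rank 3), detected statically:
        store_ragged(Y, y_off, y_size, [0, 0], data)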
