
Commit 0d44a36

[FRONTEND] Ragged TMA atomic add (#8238)
Adds ragged TMA atomic add support to Triton.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 905b3d1 commit 0d44a36

File tree

2 files changed: 34 additions & 5 deletions


python/test/unit/cuda/test_tma_descriptor.py

Lines changed: 18 additions & 5 deletions
```diff
@@ -2,7 +2,7 @@
 import pytest
 import torch
 import triton
-from triton.tools.ragged_tma import create_ragged_descriptor, load_ragged, store_ragged
+from triton.tools.ragged_tma import create_ragged_descriptor, atomic_add_ragged, load_ragged, store_ragged
 from triton.tools.tensor_descriptor import TensorDescriptor


@@ -55,6 +55,13 @@ def example_load_store_kernel(X, Y, x_off, y_off, x_size, y_size):
     store_ragged(Y, y_off, y_size, [0, 0], data)


+@triton.jit
+def example_load_atomic_add_kernel(X, Y, x_off, y_off, x_size, y_size):
+
+    data = load_ragged(X, x_off, x_size, [0, 0])
+    atomic_add_ragged(Y, y_off, y_size, [0, 0], data)
+
+
 @pytest.mark.parametrize("dtype", [
     "bfloat16", "float16", "float32", "float64",  # floating-point
     "int8", "int16", "int32", "int64",  # signed integers
@@ -66,28 +73,34 @@ def test_ragged_tma(dtype):
         pytest.skip("Test requires Hopper or Blackwell target.")
         return

+    test_atomic_add = dtype in ["bfloat16", "float16", "float32", "int32"]
     dtype = getattr(torch, dtype)

-    src = torch.randn((1024, 80), dtype=torch.float32, device="cuda").to(dtype)
+    src1 = torch.randn((1024, 80), dtype=torch.float32, device="cuda").to(dtype)
+    src2 = torch.randn((1024, 80), dtype=torch.float32, device="cuda").to(dtype)
     ref = torch.randn((1024, 80), dtype=torch.float32, device="cuda").to(dtype)
     dst = ref.clone()

-    X = create_ragged_descriptor(src, [32, 128])
+    X1 = create_ragged_descriptor(src1, [32, 128])
+    X2 = create_ragged_descriptor(src2, [32, 128])
     Y = create_ragged_descriptor(dst, [32, 128])

     x_off = 42
     y_off = 51
     x_size = 17
     y_size = 24

-    example_load_store_kernel[(1, )](X, Y, x_off, y_off, x_size, y_size)
+    example_load_store_kernel[(1, )](X1, Y, x_off, y_off, x_size, y_size)
+    if test_atomic_add:
+        example_load_atomic_add_kernel[(1, )](X2, Y, x_off, y_off, x_size, y_size)

     # the initial and final segments are unchanged:
     res0 = torch.equal(dst[:y_off], ref[:y_off])
     res1 = torch.equal(dst[y_off + y_size:], ref[y_off + y_size:])

     # this segment will be copied verbatim from src:
-    res2 = torch.equal(dst[y_off:y_off + x_size], src[x_off:x_off + x_size])
+    ref_tensor = src1 + src2 if test_atomic_add else src1
+    res2 = torch.equal(dst[y_off:y_off + x_size], ref_tensor[x_off:x_off + x_size])

     # this segment will have read OOB zeroes and written them here:
     res3 = torch.all(dst[y_off + x_size:y_off + y_size] == 0.0).item()
```
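
In plain PyTorch terms, the four `res` checks amount to the following restatement of what `dst` should contain after the launches. This is a paraphrase of the test's assertions, not part of the patch; all names (`src1`, `src2`, `ref`, `dst`, the offsets) are the ones defined in the test above:

```python
# Paraphrase of the test's assertions. When test_atomic_add holds, the middle
# segment receives the sum of both sources; otherwise it is a plain copy of src1.
expected = ref.clone()

contrib = src1 + src2 if test_atomic_add else src1
expected[y_off:y_off + x_size] = contrib[x_off:x_off + x_size]

# the ragged loads read zeroes past x_size, and those zeroes are stored
# (and atomically added) into this tail of the destination segment
expected[y_off + x_size:y_off + y_size] = 0

# rows outside [y_off, y_off + y_size) keep their original ref values
assert torch.equal(dst, expected)
```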

python/triton/tools/ragged_tma.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -90,3 +90,19 @@ def store_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
     c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
     data = tl.reshape(data, [1, 1] + data.shape)
     TMA.store([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
+
+
+@triton.jit
+def atomic_add_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
+    """
+    Atomic add into a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where adds outside the subarray are masked
+    correctly.
+
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.atomic_add().
+    """
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+    data = tl.reshape(data, [1, 1] + data.shape)
+    TMA.atomic_add([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
```
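
Combined with `load_ragged`, the new helper gives an in-kernel ragged accumulate. Below is a minimal usage sketch, assuming a Hopper or Blackwell GPU; the kernel name, tensor shapes, and offsets are illustrative rather than part of the patch, and float32 is chosen because the test above only exercises atomic add for bfloat16/float16/float32/int32:

```python
import torch
import triton
from triton.tools.ragged_tma import (atomic_add_ragged, create_ragged_descriptor,
                                     load_ragged)


@triton.jit
def ragged_accumulate_kernel(X, Y, x_off, y_off, x_size, y_size):
    # read a [32, 128] tile from the ragged source; rows past x_size come back as zero
    data = load_ragged(X, x_off, x_size, [0, 0])
    # atomically add the tile into the ragged destination; rows past y_size are masked
    atomic_add_ragged(Y, y_off, y_size, [0, 0], data)


src = torch.randn((1024, 80), dtype=torch.float32, device="cuda")
acc = torch.zeros((1024, 80), dtype=torch.float32, device="cuda")

X = create_ragged_descriptor(src, [32, 128])  # [32, 128] is the tile shape per call
Y = create_ragged_descriptor(acc, [32, 128])

# accumulate a 17-row ragged segment of src into a 24-row segment of acc
ragged_accumulate_kernel[(1, )](X, Y, 42, 51, 17, 24)
```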
