Skip to content

Commit c6c7ac0

Browse files
pchen7e2 authored and meta-codesync[bot] committed
[2/N][TLX-2cta] Expose cluster_cta_rank (#638)
Summary: This will expose the capability for executing CTA to know whether it's leader CTA in the pair or not. It will be necessary if we want non leader CTA to arrive a barrier for leader CTA and synchronize the two before issuing MMA. ``` % make test-lit ninja -C /data/users/pchen7e4/triton/build/cmake.linux-x86_64-cpython-3.11 check-triton-lit-tests ninja: Entering directory `/data/users/pchen7e4/triton/build/cmake.linux-x86_64-cpython-3.11' [0/1] Running the triton regression tests Testing Time: 7.81s Total Discovered Tests: 208 Passed : 207 (99.52%) Expectedly Failed: 1 (0.48%) % third_party/tlx/run_all.sh Need to build triton in this script? {y|n}n Run all LITs? {y|n}n Run core Triton python unit tests? {y|n}n Run all TLX unit tests? {y|n}y Running TLX Unit Tests ... ====================================================================================== 31 passed, 76 skipped in 19.55s ====================================================================================== Run TLX tutorial kernels (correctness|performance|no)? {c|p|n} c Verifying correctness of TLX tutorial kernels (all passing) ``` Pull Request resolved: #638 Reviewed By: htyu Differential Revision: D86249537 Pulled By: pchen7e2 fbshipit-source-id: 1becf189deab327d33ea64d32963723668bae257
1 parent ca34989 commit c6c7ac0

File tree

5 files changed

+62
-0
lines changed

5 files changed

+62
-0
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ While this approach places more responsibility on the user, it reduces the compi
5050

5151
Slice an `M x N` tensor at an `m x n` offset.
5252

53+
### Remote buffer operations
54+
55+
- `buffer = tlx.remote_view(buffer, remote_cta_rank)`
56+
57+
Return a remote view of the `buffer` living in another CTA in the same cluster with ID `remote_cta_rank`. NOTE: for
58+
now we only support barrier as `buffer`, not general SMEM.
59+
5360
### Async memory access
5461

5562

@@ -171,6 +178,11 @@ Examples: how mbarriers are communicated in warp specialization
171178

172179
`tlx.async_task(num_warps=4)` defines a warp-specialized asynchronous task that explicitly reserves 4 warps in addition to those used by the trunk task.
173180

181+
### Other operations
182+
183+
- `tlx.cluster_cta_rank()`
184+
185+
Returns the rank (unique ID) of the current CTA within the cluster.
174186

175187
- `tlx.thread_id(axis)`
176188

python/test/unit/language/test_tlx.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,37 @@ def store_from_thread_0_kernel(
552552
torch.testing.assert_close(output, expected_output)
553553

554554

555+
@pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
556+
def test_custer_cta_rank(device):
557+
558+
@triton.jit
559+
def test_cta_0_kernel(
560+
output_ptr,
561+
n_elements,
562+
BLOCK_SIZE: tl.constexpr,
563+
):
564+
pid = tl.program_id(axis=0)
565+
block_start = pid * BLOCK_SIZE
566+
offsets = block_start + tl.arange(0, BLOCK_SIZE)
567+
mask = offsets < n_elements
568+
# without multi-cta cluster launch, this test does not validate much except
569+
# the fact that the IR lowering flow works
570+
cta_id = tlx.cluster_cta_rank()
571+
tl.store(output_ptr + offsets, cta_id, mask=mask)
572+
573+
tensor_size = 32
574+
# init with 1, expected to be filled with 0
575+
output = torch.ones(tensor_size, dtype=torch.int32, device=device)
576+
kernel = test_cta_0_kernel[(1, )](output, tensor_size, tensor_size, num_warps=1)
577+
578+
ttgir = kernel.asm["ttgir"]
579+
assert ttgir.count("nvgpu.cluster_id") == 1
580+
581+
torch.cuda.synchronize()
582+
expected_output = torch.zeros(tensor_size, dtype=torch.int32, device=device)
583+
torch.testing.assert_close(output, expected_output)
584+
585+
555586
def test_clock64(device):
556587

557588
@triton.jit

third_party/tlx/dialect/triton_tlx.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "Transforms/Passes.h"
33
#include "ir.h" // TritonOpBuilder
44
#include "mlir/Pass/PassManager.h"
5+
#include "nvidia/include/Dialect/NVGPU/IR/Dialect.h"
56
#include "passes.h"
67
#include "tlx/dialect/include/Transforms/Passes.h"
78
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -555,6 +556,14 @@ void init_triton_tlx_ir(py::module &&m) {
555556
self.getBuilder().getI32Type(), threadId);
556557
return threadId;
557558
})
559+
.def("create_cluster_cta_rank",
560+
[](TritonOpBuilder &self) -> Value {
561+
// The naming of ClusterCTAIdOp is bad. It actually returns the
562+
// cluster CTA rank (1D) instead of cluster CTA ID (3D)
563+
Value rank = self.create<triton::nvgpu::ClusterCTAIdOp>(
564+
self.getBuilder().getI32Type());
565+
return rank;
566+
})
558567
.def("create_map_to_remote_buffer",
559568
[](TritonOpBuilder &self, Value &src,
560569
Value &clusterCTARank) -> Value {

third_party/tlx/language/tlx/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
tcgen05_commit,
3333
)
3434
from .utility import (
35+
cluster_cta_rank,
3536
thread_id,
3637
async_task_replica_id,
3738
dtype_of,
@@ -96,6 +97,7 @@
9697
"async_dot_wait",
9798
"tcgen05_commit",
9899
# utility
100+
"cluster_cta_rank",
99101
"thread_id",
100102
"async_task_replica_id",
101103
"dtype_of",

third_party/tlx/language/tlx/utility.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ def cuda_parse_arch(arch):
1717
return int(match.group(1))
1818

1919

20+
@tl.builtin
21+
def cluster_cta_rank(_semantic=None):
22+
"""
23+
:return the unique CTA ID within a cluster across all dims
24+
"""
25+
return tl.tensor(_semantic.builder.create_cluster_cta_rank(), tl.int32)
26+
27+
2028
@tl.builtin
2129
def thread_id(axis, _semantic=None):
2230
"""

0 commit comments

Comments
 (0)