Skip to content

Commit 9a0e0cb

Browse files
pchen7e2 authored and meta-codesync[bot] committed
[1/N][TLX-2cta] Introduce TTNG_MapToRemoteBufferOp (#637)
Summary: To be able to essentially call NV's "mapa" on an SMEM buffer (or a barrier living there), we need to open up this API to front end. This will make it possible to explicitly arrive a remote barrier, and in the future, read/write DSMEM. Note if the CTAId is the executing CTA, original src address will be returned. Marking this Op as `MemDescViewTrait` will automatically handle alias analysis like MemDescIndex ops etc. ``` % make test-lit ninja -C /data/users/pchen7e4/triton/build/cmake.linux-x86_64-cpython-3.11 check-triton-lit-tests ninja: Entering directory `/data/users/pchen7e4/triton/build/cmake.linux-x86_64-cpython-3.11' [0/1] Running the triton regression tests Testing Time: 7.81s Total Discovered Tests: 208 Passed : 207 (99.52%) Expectedly Failed: 1 (0.48%) % third_party/tlx/run_all.sh Need to build triton in this script? {y|n}n Run all LITs? {y|n}n Run core Triton python unit tests? {y|n}n Run all TLX unit tests? {y|n}y Running TLX Unit Tests ... ====================================================================================== 31 passed, 76 skipped in 19.55s ====================================================================================== Run TLX tutorial kernels (correctness|performance|no)? {c|p|n} c Verifying correctness of TLX tutorial kernels (all passing) ``` Pull Request resolved: #637 Reviewed By: htyu Differential Revision: D86244418 Pulled By: pchen7e2 fbshipit-source-id: f62f28aefe5630d81e27fa9395e2f973db72b015
1 parent b8dfa52 commit 9a0e0cb

File tree

9 files changed

+167
-5
lines changed

9 files changed

+167
-5
lines changed

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,25 @@ def TTNG_ClusterWaitOp : TTNG_Op<"cluster_wait", []> {
7070
let assemblyFormat = "attr-dict";
7171
}
7272

73+
def TTNG_MapToRemoteBufferOp : TTNG_Op<"map_to_remote_buffer", [Pure, MemDescViewTrait]> {
  let summary = "Map shared memory buffer to the corresponding buffer in the target CTA";

  let description = [{
    Given a shared memory buffer mem desc `src`, return a mem desc referring to the corresponding buffer in the
    specified target CTA.

    `$ctaRank` refers to the unique CTA id in a cluster across all dims. e.g. For a 2x4 CTA cluster, a valid CTA rank
    will be 0~7.
  }];

  let arguments = (ins TTG_MemDescType:$src, I32:$ctaRank);

  let results = (outs TTG_MemDescType:$result);

  let assemblyFormat = [{$src`,` $ctaRank attr-dict `:` qualified(type($src)) `->` qualified(type($result))}];

  // Verifier (in Ops.cpp) checks that src/result agree except for memory space.
  let hasVerifier = 1;
}
91+
7392
//
7493
// WarpGroupDot Op
7594
//

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,29 @@ namespace mlir {
3838
namespace triton {
3939
namespace nvidia_gpu {
4040

41+
LogicalResult MapToRemoteBufferOp::verify() {
  // The source and result mem descs must be identical except for the memory
  // space: the result is the same buffer viewed from another CTA in the
  // cluster.
  MemDescType srcTy = getSrc().getType();
  MemDescType dstTy = getResult().getType();

  bool sameExceptSpace = srcTy.getShape() == dstTy.getShape() &&
                         srcTy.getElementType() == dstTy.getElementType() &&
                         srcTy.getEncoding() == dstTy.getEncoding() &&
                         srcTy.getMutableMemory() == dstTy.getMutableMemory() &&
                         srcTy.getAllocShape() == dstTy.getAllocShape();
  if (!sameExceptSpace) {
    return emitOpError() << "Local MemDesc not matching Remote MemDesc: "
                         << srcTy << " vs " << dstTy;
  }

  // The source must live in plain (local) shared memory...
  if (!isa<SharedMemorySpaceAttr>(srcTy.getMemorySpace())) {
    return emitOpError() << "Invalid memory space for local MemDesc: "
                         << srcTy;
  }
  // ...and the result must be tagged as cluster-shared memory.
  if (!isa<SharedClusterMemorySpaceAttr>(dstTy.getMemorySpace())) {
    return emitOpError() << "Invalid memory space for remote MemDesc: "
                         << dstTy;
  }
  return success();
}
63+
4164
// -- WarpGroupDotOp --
4265
LogicalResult WarpGroupDotOp::inferReturnTypes(
4366
MLIRContext *context, std::optional<Location> location, ValueRange operands,

test/Conversion/tritonnvidiagpu_to_llvm.mlir

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,3 +271,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
271271
tt.return
272272
}
273273
}
274+
275+
// -----
276+
277+
// Lowering test: map_to_remote_buffer on an mbarrier-style 1xi64 smem buffer
// must lower to nvvm.mapa, mapping addrspace(3) to addrspace(7).
// CHECK-LABEL: map_smem_to_remote
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65536 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
  tt.func public @map_smem_to_remote(%arg: !ttg.memdesc<1xi64, #shared, #smem, mutable>) {
    %c1_i32 = arith.constant 1 : i32
    // CHECK: nvvm.mapa %{{.*}} : !llvm.ptr<3> -> !llvm.ptr<7>
    %0 = ttng.map_to_remote_buffer %arg, %c1_i32: !ttg.memdesc<1xi64, #shared, #smem, mutable> -> !ttg.memdesc<1xi64, #shared, #ttng.shared_cluster_memory, mutable>
    tt.return
  }
}

test/TritonNvidiaGPU/invalid.mlir

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
// RUN: triton-opt --split-input-file %s --verify-diagnostics
22

3+
// Negative test: the verifier must reject a result mem desc whose memory
// space is plain #smem instead of #ttng.shared_cluster_memory.
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65536 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
  tt.func public @map_smem_to_remote(%arg: !ttg.memdesc<1xi64, #shared, #smem, mutable>) {
    %c1_i32 = arith.constant 1 : i32
    // expected-error @+1 {{Invalid memory space for remote MemDesc}}
    %0 = ttng.map_to_remote_buffer %arg, %c1_i32: !ttg.memdesc<1xi64, #shared, #smem, mutable> -> !ttg.memdesc<1xi64, #shared, #smem, mutable>
    tt.return
  }
}
13+
14+
// -----
15+
316
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
417
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65536 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
518
tt.func public @alloc_tensor_memory() {

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ClusterOpsToLLVM.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "PatternTritonGPUOpToLLVM.h"
2626
#include "mlir/Conversion/LLVMCommon/Pattern.h"
2727
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
28+
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
2829
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
2930

3031
using namespace mlir;
@@ -63,12 +64,55 @@ struct ClusterWaitOpConversion
6364
return success();
6465
}
6566
};
67+
68+
// lower MapToRemoteBufferOp
69+
struct MapToRemoteBufferOpConversion
70+
: public ConvertOpToLLVMPattern<triton::nvidia_gpu::MapToRemoteBufferOp> {
71+
using ConvertOpToLLVMPattern<
72+
triton::nvidia_gpu::MapToRemoteBufferOp>::ConvertOpToLLVMPattern;
73+
74+
LogicalResult
75+
matchAndRewrite(triton::nvidia_gpu::MapToRemoteBufferOp op, OpAdaptor adaptor,
76+
ConversionPatternRewriter &rewriter) const override {
77+
auto loc = op.getLoc();
78+
auto srcSmemObj = LLVM::getSharedMemoryObjectFromStruct(
79+
loc, adaptor.getSrc(),
80+
typeConverter->convertType(op.getSrc().getType().getElementType()),
81+
rewriter);
82+
auto srcSmemPtr = srcSmemObj.getBase();
83+
84+
auto ptrTy = cast<LLVM::LLVMPointerType>(srcSmemPtr.getType());
85+
assert(ptrTy.getAddressSpace() == 3 &&
86+
"Invalid src llvm addr space for MapToRemoteBufferOp");
87+
88+
// The result pointer is referring to a memory buffer living in a CTA
89+
// cluster, so it has a different memory space. NVVM::MapaOp verifies its
90+
// src and result ptr type, so we need to construct the result ptr type
91+
// from typeConverter output here
92+
LLVM::LLVMStructType convertedRetTy =
93+
cast<LLVM::LLVMStructType>(typeConverter->convertType(op.getType()));
94+
Type convertedPtrTy = convertedRetTy.getBody()[0];
95+
96+
// map an SMEM ptr in mem space 3 to a ptr in mem space 7
97+
auto remotePtr = rewriter.create<NVVM::MapaOp>(
98+
loc, convertedPtrTy, srcSmemPtr, adaptor.getCtaRank());
99+
100+
// everything stays the same except base ptr comparing to srcSmemObj
101+
auto dstSmemObj = SharedMemoryObject(
102+
remotePtr, srcSmemObj.getBaseElemType(), srcSmemObj.getOffsets());
103+
auto retVal = getStructFromSharedMemoryObject(loc, dstSmemObj, rewriter);
104+
rewriter.replaceOp(op, retVal);
105+
return success();
106+
}
107+
};
108+
66109
} // namespace
67110

68111
void mlir::triton::NVIDIA::populateClusterOpsToLLVMPatterns(
69112
LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
70113
PatternBenefit benefit) {
71114
patterns.add<ClusterArriveOpConversion>(typeConverter, benefit);
72115
patterns.add<ClusterWaitOpConversion>(typeConverter, benefit);
116+
patterns.add<MapToRemoteBufferOpConversion>(typeConverter, benefit);
73117
return;
74118
}

third_party/tlx/dialect/triton_tlx.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,22 @@ void init_triton_tlx_ir(py::module &&m) {
554554
threadId = self.create<arith::IndexCastOp>(
555555
self.getBuilder().getI32Type(), threadId);
556556
return threadId;
557+
})
558+
.def("create_map_to_remote_buffer",
559+
[](TritonOpBuilder &self, Value &src,
560+
Value &clusterCTARank) -> Value {
561+
auto bufferType = cast<ttg::MemDescType>(src.getType());
562+
assert(
563+
isa<ttg::SharedMemorySpaceAttr>(bufferType.getMemorySpace()) &&
564+
"Input of MapToRemoteBuffer has to be local SMEM");
565+
auto newBufferType = ttg::MemDescType::get(
566+
bufferType.getShape(), bufferType.getElementType(),
567+
bufferType.getEncoding(),
568+
ttng::SharedClusterMemorySpaceAttr::get(self.getContext()),
569+
bufferType.getMutableMemory(), bufferType.getAllocShape());
570+
Value remoteBuf = self.create<ttng::MapToRemoteBufferOp>(
571+
newBufferType, src, clusterCTARank);
572+
return remoteBuf;
557573
});
558574
}
559575

third_party/tlx/language/tlx/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
CLCPipelineContext,
1616
async_token,
1717
)
18-
from .mem_ops import (local_alloc, local_view, local_slice, subslice, async_load, async_load_commit_group,
18+
from .mem_ops import (local_alloc, local_view, remote_view, local_slice, subslice, async_load, async_load_commit_group,
1919
async_load_wait_group, local_load, local_store, local_trans, local_reinterpret,
2020
async_descriptor_load, async_descriptor_store, async_descriptor_store_wait, fence_async_shared)
2121
from .barrier import (
@@ -70,6 +70,7 @@
7070
# mem_ops
7171
"local_alloc",
7272
"local_view",
73+
"remote_view",
7374
"local_slice",
7475
"subslice",
7576
"async_load",

third_party/tlx/language/tlx/mem_ops.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from . import types as tlx
44
from .utility import cuda_parse_arch
55
from .mma_ops import require_nv_mma_shared_layout
6+
from .types import storage_kind
67
from typing import Optional, Tuple, overload
78

89

@@ -183,6 +184,35 @@ def _buffered_tensor_getitem(self, buffer_idx):
183184
return local_view(self, buffer_idx, _semantic=self.type.semantic)
184185

185186

187+
@tl.builtin
def remote_view(
    local_allocated_buffer: tlx.mbarrier,
    remote_cta_rank: int | tl.constexpr | tl.tensor,
    _semantic=None,
) -> tlx.mbarrier:
    """
    Returns a remote view of the buffer. This returns a remote buf handle living in a CTA in the same CTA cluster with the
    executing CTA.

    :arg local_allocated_buffer: the local buffer handle we start with
    :arg remote_cta_rank: unique ID of the remote CTA within the CTA cluster. This ID is across all dims, so e.g. for
        a cluster of shape [2, 4] a valid unique ID could be 0~7, including the executing CTA itself
    :returns: a remote view of the buffer, located at the same relative location, but just in a possibly different CTA
    """
    assert isinstance(local_allocated_buffer, tlx.mbarrier), "remote_view only supports barrier for now"
    assert local_allocated_buffer.type.storage == storage_kind.smem, "remote_view requires local smem as input"
    if isinstance(remote_cta_rank, (tl.constexpr, int)):
        # Materialize the compile-time rank as an i32 IR value.
        remote_cta_rank_handle = _semantic._convert_elem_to_ir_value(tl._unwrap_if_constexpr(remote_cta_rank),
                                                                     require_i64=False)
    else:
        assert isinstance(
            remote_cta_rank, tl.tensor
        ), f"`remote_cta_rank` is in type {type(remote_cta_rank)} (must be either `tl.tensor` or `tl.constexpr`)"
        remote_cta_rank_handle = remote_cta_rank.handle
    remote_buf_handle = _semantic.builder.create_map_to_remote_buffer(local_allocated_buffer.handle,
                                                                      remote_cta_rank_handle)
    # BUGFIX: the 4th positional parameter of `tlx.mbarrier` is `semantics`,
    # not `storage`. Passing `storage_kind.smemCluster` positionally stored the
    # storage enum in the semantics slot and left `storage` at its smem
    # default, so the returned view reported smem storage. Forward the local
    # buffer's semantic and pass `storage` by keyword instead.
    return tlx.mbarrier(remote_buf_handle, 0, local_allocated_buffer.type.layout,
                        local_allocated_buffer.type.semantic, storage=storage_kind.smemCluster)
214+
215+
186216
tlx.buffered_tensor.__getitem__ = _buffered_tensor_getitem
187217
tlx.mbarrier.__getitem__ = _buffered_tensor_getitem
188218
tlx.clc_response.__getitem__ = _buffered_tensor_getitem

third_party/tlx/language/tlx/types.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import enum
55
from abc import abstractmethod
66
from triton._C.libtriton import ir
7+
78
from triton.language.semantic import TritonSemantic
89

910

@@ -287,9 +288,10 @@ class mbarrier(tl.base_value):
287288
"""
288289

289290
def __init__(self, handle, num: int, layout: Optional[swizzled_shared_layout_encoding],
290-
semantics: TritonSemantic = None):
291+
semantics: TritonSemantic = None, storage: storage_kind = storage_kind.smem):
292+
assert storage == storage_kind.smem or storage == storage_kind.smemCluster, "mbarrier requires storage to be smem or smemCluster"
291293
self.handle = handle
292-
self.type = mbarrier_type(num, layout, semantics)
294+
self.type = mbarrier_type(num, layout, semantics, storage)
293295
self.num = num
294296

295297
def _flatten_ir(self, handles) -> None:
@@ -305,8 +307,8 @@ def _unflatten_ir(self, handles, cursor):
305307

306308
class mbarrier_type(buffered_tensor_type):
307309

308-
def __init__(self, num: int, layout: Optional[swizzled_shared_layout_encoding], semantic: TritonSemantic):
309-
super().__init__(tl.int64, [1], num, storage_kind.smem, layout, semantic)
310+
def __init__(self, num: int, layout: Optional[swizzled_shared_layout_encoding], semantic: TritonSemantic, storage):
311+
super().__init__(tl.int64, [1], num, storage, layout, semantic)
310312

311313
def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[mbarrier, int]:
312314
value = mbarrier(handles[cursor], self.num, self.layout, self.semantic)

0 commit comments

Comments
 (0)