Merged
Commits (30)
ba17a9e
Basic lowering AsyncCommitGroup and AsyncWait
AlexAUT Jan 17, 2025
2327587
WIP lowering of AsyncCopy
AlexAUT Jan 17, 2025
1d4edf6
Added layout checks for asynccopy lowering
AlexAUT Jan 24, 2025
ead4915
Support direct to lds
AlexAUT Jan 27, 2025
3141ba4
Enable non working masking
AlexAUT Jan 27, 2025
644aa1e
Add support to enable disable direct to lds with env var AMDGCN_USE_D…
AlexAUT Jan 28, 2025
7c9bab1
Fix masking and others for direct to lds
AlexAUT Jan 28, 2025
cb823d0
Fix when AsycCopy is lowered without a mask
AlexAUT Jan 28, 2025
c097616
Use ROCDL instead of intrinsics
AlexAUT Jan 28, 2025
1a9f1e0
Cleanup and simplify AsyncCopy lowering
AlexAUT Jan 28, 2025
a20b686
CacheModifiers for AsyncCopy
AlexAUT Jan 28, 2025
97d677d
Add lit test for AsyncCopy
AlexAUT Jan 28, 2025
30352ad
Split AsyncCopy Lit for gfx950
AlexAUT Jan 28, 2025
fe8619d
Add const to getCtrlBitsForCacheModifierOnTarget
AlexAUT Jan 28, 2025
7941a30
Cleanup StreamPipeliner changes
AlexAUT Jan 28, 2025
def9313
Revert stream pipeline related changes
AlexAUT Jan 28, 2025
318caa2
Add missing CDNA1 to AsyncCopy support list
AlexAUT Jan 28, 2025
6600138
Cleanup
AlexAUT Jan 28, 2025
ea02c3c
Replace macros for llvm ops with TritonLLVMOpBuilder
AlexAUT Jan 29, 2025
13419bb
Fix wrong value in supported bit width for global.to.lds
AlexAUT Jan 30, 2025
ca8b441
Addressing review comments
AlexAUT Jan 31, 2025
6aa3554
Unified async ops lit tests
AlexAUT Jan 31, 2025
04fad93
Emit correct wmcnt wait instead of waiting on all cnts
AlexAUT Jan 31, 2025
f6cbe22
Add tests for AsyncWait/AsyncCommitGroup
AlexAUT Jan 31, 2025
3d30f43
Limit AsyncWait conversion to gfx9
AlexAUT Feb 3, 2025
0c382db
Add AsyncOpy lowering lit test with masking and other values
AlexAUT Feb 3, 2025
f560aeb
Added async copy lit tests with cache modifiers
AlexAUT Feb 5, 2025
d6b0d02
Merge branch 'main' into global_to_lds_lowering
AlexAUT Feb 5, 2025
d90ffbe
Adjust to shared encoding changes
AlexAUT Feb 5, 2025
5356802
Fix a few small issues
antiagainst Feb 5, 2025
43 changes: 43 additions & 0 deletions test/Conversion/amd/tritongpu_to_llvm.mlir
@@ -294,3 +294,46 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
tt.return
}
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: async_copy
tt.func public @async_copy(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: i32 {tt.divisibility = 16 : i32},
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
// We need the splat to allow the AxisAnalysis to work during lowering
%1 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
// CHECK: rocdl.global.load.lds
%2 = ttg.async_copy_global_to_local %1, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
tt.return
}
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: async_copy_vectorized
tt.func public @async_copy_vectorized(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: i32 {tt.divisibility = 16 : i32},
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
// We need the index calculation so AxisAnalysis sees that we can vectorize the load
%1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
%3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
%4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
%5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>

// Each thread needs to load 8 elements and we load 2 (sizePerThread) per global.load.lds
// CHECK-COUNT-4: rocdl.global.load.lds
// CHECK-NOT: rocdl.global.load.lds
%6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
tt.return
}
}
24 changes: 24 additions & 0 deletions test/Conversion/amd/tritongpu_to_llvm_gfx950.mlir
@@ -0,0 +1,24 @@
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 --convert-builtin-func-to-llvm | FileCheck %s

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: async_copy_vectorized
tt.func public @async_copy_vectorized(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: i32 {tt.divisibility = 16 : i32},
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
// We need the index calculation so AxisAnalysis sees that we can vectorize the load
%1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
%3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
%4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
%5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>

// Each thread needs to load 8 elements and we load 8 (sizePerThread) per global.load.lds
// CHECK: rocdl.global.load.lds
// CHECK-NOT: rocdl.global.load.lds
%6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
tt.return
}
}
221 changes: 220 additions & 1 deletion third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -20,9 +20,12 @@ using namespace mlir::triton::gpu;

using ::mlir::LLVM::delinearize;
using ::mlir::LLVM::getSharedMemoryBase;
using ::mlir::LLVM::AMD::getContiguity;
using mlir::LLVM::AMD::getCtrlBitsForCacheModifierOnTarget;
using ::mlir::LLVM::AMD::getVectorSize;
using ::mlir::LLVM::AMD::llLoad;
using ::mlir::LLVM::AMD::llStore;
using ::mlir::triton::AMD::ISAFamily;
using ::mlir::triton::gpu::getTotalElemsPerThread;

namespace {
@@ -396,6 +399,177 @@ struct BufferLoadOpConversion
}
};

struct AsyncCopyGlobalToLocalOpConversion
: public ConvertOpToLLVMPattern<triton::gpu::AsyncCopyGlobalToLocalOp>,
public LoadStoreConversionBase {
using ConvertOpToLLVMPattern<
triton::gpu::AsyncCopyGlobalToLocalOp>::ConvertOpToLLVMPattern;

AsyncCopyGlobalToLocalOpConversion(LLVMTypeConverter &converter,
const AMD::TargetInfo &targetInfo,
ModuleAxisInfoAnalysis &axisAnalysisPass,
PatternBenefit benefit)
: ConvertOpToLLVMPattern<triton::gpu::AsyncCopyGlobalToLocalOp>(converter,
benefit),
LoadStoreConversionBase(targetInfo, axisAnalysisPass) {}

bool isLoadWidthSupported(unsigned bits,
const AMD::TargetInfo &targetInfo) const {
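// Supported global.load.lds widths: CDNA1-3 allow 8-, 16-, and 32-bit
// transfers; gfx950 additionally allows 96- and 128-bit transfers.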
llvm::SmallSetVector<unsigned, 10> supportedWidths;
switch (targetInfo.getISAFamily()) {
case mlir::triton::AMD::ISAFamily::CDNA1:
case mlir::triton::AMD::ISAFamily::CDNA2:
case mlir::triton::AMD::ISAFamily::CDNA3:
supportedWidths.insert(8);
supportedWidths.insert(16);
supportedWidths.insert(32);
if (targetInfo.getGPUKind() == llvm::AMDGPU::GPUKind::GK_GFX950) {
supportedWidths.insert(96);
supportedWidths.insert(128);
}
break;
default:
return false;
}

return supportedWidths.contains(bits);
}

LogicalResult
matchAndRewrite(triton::gpu::AsyncCopyGlobalToLocalOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {

auto loc = op.getLoc();
auto b = TritonLLVMOpBuilder(loc, rewriter);

auto srcTy = op.getSrc().getType();
auto srcEncoding = srcTy.getEncoding();
assert((isa<BlockedEncodingAttr, SliceEncodingAttr>(srcEncoding) &&
"Unexpected srcEncoding in AsyncCopyGlobalToLocalOpConversion"));
assert(srcTy.getShape().size() <= 2 && "Async copy only supports 1d and 2d "
"tensors: Unexpected rank of %src");

auto dstTy = op.getResult().getType();
auto resElemTy = getTypeConverter()->convertType(dstTy.getElementType());

Value llSrc = adaptor.getSrc();

auto srcElems = unpackLLElements(loc, llSrc, rewriter);

Value llDst = adaptor.getResult();
auto smemObj = mlir::LLVM::getSharedMemoryObjectFromStruct(
loc, llDst, resElemTy, rewriter);

unsigned maxVec = getContiguity(op.getSrc(), axisAnalysisPass);

Value mask = op.getMask();
if (mask) {
maxVec = std::min(maxVec, getMaskAlignment(mask));
}

// global.load.lds does not support per-lane offsets.
// We need to ensure that we write coalesced into shared memory.
// This means that the kLane dim needs to be contiguous based on the
// vectorization size.
auto shape = dstTy.getShape();
LinearLayout srcLayout =
triton::gpu::toLinearLayout(shape, srcTy.getEncoding());
LinearLayout sharedLayout = triton::gpu::toLinearLayout(
shape, dstTy.getEncoding(), resElemTy.getIntOrFloatBitWidth());
LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);

StringAttr kLane = rewriter.getStringAttr("lane");
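// Each lane bit i must map to a shared-memory offset of maxVec * 2^i, i.e.
// consecutive lanes must write consecutive maxVec-sized chunks; otherwise
// the write into LDS would not be coalesced and we fail the conversion.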
for (int inLane : llvm::seq(srcToSharedLayout.getInDimSizeLog2(kLane))) {
auto basis = srcToSharedLayout.getBasis(kLane, inLane)[0];
unsigned expected = maxVec * (1 << inLane);
if (basis != expected) {
return emitError(loc, "Invalid layout in AsyncCopy: ")
<< "Lane: " << 1 + inLane << " is " << basis << " should be "
<< expected << "\n";
}
}

// Addresses to store into, one per `vecTy`.
VectorType vecTy;
SmallVector<Value> shmemAddrs;
bool ok = emitTransferBetweenRegistersAndShared(
srcTy, dstTy, resElemTy, {}, smemObj, loc, rewriter, targetInfo,
[&](VectorType vecTy_, Value shmemAddr) {
vecTy = vecTy_;
shmemAddrs.push_back(shmemAddr);
});
assert(ok);

int vecBits = vecTy.getNumElements() * vecTy.getElementTypeBitWidth();
if (!isLoadWidthSupported(vecBits, targetInfo)) {
return emitError(loc, "Async copy does not support the required load "
"vectorization, got ")
<< vecBits << " bits";
}

int vecBytes = vecBits / 8;
assert(llvm::isPowerOf2_32(vecBytes));
Value vecBytesVal = b.i32_val(vecBytes);

Value cacheModifiers = b.i32_val(
getCtrlBitsForCacheModifierOnTarget(op.getCache(), false, targetInfo));

Value llMask = adaptor.getMask();
SmallVector<Value> maskElems;
if (llMask) {
maskElems = unpackLLElements(loc, llMask, rewriter);
assert(srcElems.size() == maskElems.size());
}

Value other = op.getOther();
SmallVector<Value> otherElems;
if (other) {
otherElems = unpackLLElements(loc, adaptor.getOther(), rewriter);
assert(srcElems.size() == otherElems.size());
}

for (int i = 0; i < shmemAddrs.size(); i++) {
auto srcIdx = i * maxVec;
auto srcPtr = srcElems[srcIdx];

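// global.load.lds cannot be predicated per lane, so without a mask we emit
// it directly; with a mask we branch around the load and, for masked-off
// lanes, store the `other` value into shared memory instead (if provided).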
if (!mask) {
rewriter.create<ROCDL::GlobalLoadLDSOp>(
loc, srcPtr, shmemAddrs[i], vecBytesVal, /*offset=*/b.i32_val(0),
cacheModifiers);
} else {
Block *currentBlock = rewriter.getInsertionBlock();
Block *afterLoad =
rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
Block *loadBlock = rewriter.createBlock(afterLoad);
rewriter.setInsertionPointToEnd(currentBlock);
rewriter.create<LLVM::CondBrOp>(loc, maskElems[srcIdx], loadBlock,
afterLoad);
rewriter.setInsertionPointToStart(loadBlock);
rewriter.create<ROCDL::GlobalLoadLDSOp>(
loc, srcPtr, shmemAddrs[i], vecBytesVal, /*offset=*/b.i32_val(0),
cacheModifiers);

rewriter.create<LLVM::BrOp>(loc, afterLoad);
rewriter.setInsertionPointToStart(afterLoad);
if (other) {
Value storeVal =
packElementRangeIntoVector(rewriter, this->getTypeConverter(),
loc, vecTy, otherElems, srcIdx);
llStore(rewriter, loc, shmemAddrs[i], storeVal,
b.icmp_ne(maskElems[srcIdx], b.true_val()), 0, op.getCache());
}
}
}

// Drop the result token.
Value zero = rewriter.create<LLVM::ConstantOp>(
op.getLoc(), IntegerType::get(op.getContext(), 32),
rewriter.getI32IntegerAttr(0));
rewriter.replaceOp(op, zero);
return success();
}
};

struct StoreOpConversion : public ConvertOpToLLVMPattern<triton::StoreOp>,
public LoadStoreConversionBase {
using ConvertOpToLLVMPattern<triton::StoreOp>::ConvertOpToLLVMPattern;
@@ -1459,6 +1633,49 @@ struct AtomicRMWOpConversion
return endBlock->getArgument(0);
}
};

struct AsyncWaitConversion : public ConvertOpToLLVMPattern<AsyncWaitOp> {
using ConvertOpToLLVMPattern<AsyncWaitOp>::ConvertOpToLLVMPattern;

AsyncWaitConversion(LLVMTypeConverter &converter,
const AMD::TargetInfo &targetInfo,
ModuleAxisInfoAnalysis &axisAnalysisPass,
PatternBenefit benefit)
: ConvertOpToLLVMPattern<AsyncWaitOp>(converter, benefit) {}

LogicalResult
matchAndRewrite(AsyncWaitOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {

auto loc = op->getLoc();
auto b = TritonLLVMOpBuilder(loc, rewriter);
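// Lower to s_waitcnt with the count carried by the op; the async token
// result is replaced by a dummy i32 value.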
rewriter.create<ROCDL::WaitcntOp>(loc, op.getNum());
rewriter.replaceOp(op, b.i32_val(0));
return success();
}
};

struct AsyncCommitGroupConversion
: public ConvertOpToLLVMPattern<AsyncCommitGroupOp> {
using ConvertOpToLLVMPattern<AsyncCommitGroupOp>::ConvertOpToLLVMPattern;

AsyncCommitGroupConversion(LLVMTypeConverter &converter,
const AMD::TargetInfo &targetInfo,
ModuleAxisInfoAnalysis &axisAnalysisPass,
PatternBenefit benefit)
: ConvertOpToLLVMPattern<AsyncCommitGroupOp>(converter, benefit) {}

LogicalResult
matchAndRewrite(AsyncCommitGroupOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
// Drop the result token
auto loc = op->getLoc();
auto b = TritonLLVMOpBuilder(loc, rewriter);
rewriter.replaceOp(op, b.i32_val(0));
return success();
}
};

} // namespace

namespace mlir::triton::AMD {
@@ -1470,7 +1687,9 @@ void populateLoadStoreOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
PatternBenefit benefit) {
patterns.add<AtomicCASOpConversion, AtomicRMWOpConversion, LoadOpConversion,
StoreOpConversion, BufferLoadOpConversion,
BufferStoreOpConversion, BufferAtomicRMWOpConversion>(
BufferStoreOpConversion, BufferAtomicRMWOpConversion,
AsyncCopyGlobalToLocalOpConversion, AsyncCommitGroupConversion,
AsyncWaitConversion>(
typeConverter, targetInfo, axisInfoAnalysis, benefit);
}
} // namespace mlir::triton::AMD
6 changes: 3 additions & 3 deletions third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -517,9 +517,9 @@ static int32_t getDefaultCtrlBitsForCacheModifier(triton::CacheModifier cm) {
// .cv: don't cache and fetch again
// .wb: write-back, writes back data at all cache levels
// .wt: write-through, write data directly to system memory
int32_t
getCtrlBitsForCacheModifierOnTarget(triton::CacheModifier cm, bool isBufferLoad,
mlir::triton::AMD::TargetInfo &targetInfo) {
int32_t getCtrlBitsForCacheModifierOnTarget(
triton::CacheModifier cm, bool isBufferLoad,
const mlir::triton::AMD::TargetInfo &targetInfo) {
if (targetInfo.getGPUKind() == llvm::AMDGPU::GK_GFX942) // gfx942
return getCtrlBitsForCacheModifierOnGFX942(cm, isBufferLoad);
else
5 changes: 3 additions & 2 deletions third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h
@@ -54,8 +54,9 @@ void llStore(RewriterBase &rewriter, Location loc, Value ptr, Value val,
// Get flags <volatile, nontemporal> for a predicated Load or Store
std::pair<bool, bool> getCacheModifierFlagsForPredicatedCall(LLVM::CallOp);
// Get the cachepolicy value for a cache modifier
int32_t getCtrlBitsForCacheModifierOnTarget(triton::CacheModifier, bool,
mlir::triton::AMD::TargetInfo &);
int32_t
getCtrlBitsForCacheModifierOnTarget(triton::CacheModifier, bool,
const mlir::triton::AMD::TargetInfo &);

// Get cache modifier information for buffer atomics
int32_t getCtrlBitsForBufferAtomicsOnGFX942(bool setSC0, bool setSC1,