Skip to content

Commit 0946015

Browse files
authored
[AMD] Count vmcnt instructions for AsyncWait (#6426)
Adds `UpdateAsyncWaitCountPass` to adjust the wait counts of `AsyncWait` ops to reflect the number of interleaved direct-to-LDS assembly instructions. The LLVM backend cannot infer the dependency between the `AsyncCopies` and the `local_reads`, so we emit it from Triton, where we have the dependency information via tracing the `AsyncToken`. The pass ignores global/buffer loads because the actual number of assembly instructions is determined by the LLVM backend. Note that an underestimation only affects performance, not correctness. `findMinPathCountInDefChain` is in a separate file because we might reuse it for combining `AsyncWaits` in the `StreamPipeliner`.
1 parent e79e08e commit 0946015

File tree

11 files changed

+670
-1
lines changed

11 files changed

+670
-1
lines changed

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
7373
mlir::registerTritonAMDGPUConvertToBufferOps();
7474
mlir::registerTritonAMDGPUInThreadTranspose();
7575
mlir::registerTritonAMDGPUCoalesceAsyncCopy();
76+
mlir::registerTritonAMDGPUUpdateAsyncWaitCount();
7677
mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
7778
mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();
7879
mlir::registerTritonAMDFoldTrueCmpI();

test/TritonGPU/amd/amd-update-async-wait-count.mlir

Lines changed: 371 additions & 0 deletions
Large diffs are not rendered by default.

third_party/amd/backend/compiler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,8 @@ def make_ttgir(mod, metadata, options):
277277
passes.common.add_canonicalizer(pm)
278278
passes.common.add_cse(pm)
279279
passes.common.add_symbol_dce(pm)
280+
if use_async_copy:
281+
amd.passes.ttgpuir.add_update_async_wait_count(pm, options.arch)
280282
pm.run(mod)
281283
return mod
282284

third_party/amd/include/TritonAMDGPUTransforms/Passes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ std::unique_ptr<Pass> createTritonAMDGPUInThreadTransposePass();
4141
std::unique_ptr<Pass>
4242
createTritonAMDGPUCoalesceAsyncCopyPass(std::string archGenName = {});
4343

44+
std::unique_ptr<Pass>
45+
createTritonAMDGPUUpdateAsyncWaitCountPass(std::string archGenName = {});
46+
4447
std::unique_ptr<Pass> createTritonAMDGPUFoldTrueCmpIPass();
4548

4649
/// Generate the code for registering passes.

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,26 @@ def TritonAMDGPUCoalesceAsyncCopy: Pass<"tritonamdgpu-coalesce-async-copy", "mli
248248
];
249249
}
250250

251+
def TritonAMDGPUUpdateAsyncWaitCount: Pass<"tritonamdgpu-update-async-wait-count", "mlir::ModuleOp"> {
252+
let summary = "Adjust async wait count to allow prefetching over multiple loop iterations";
253+
254+
let description = [{
255+
GFX9:
256+
LLVM cannot see the dependency across loop iterations between AsyncCopy and local_reads. So we
257+
compute the number of interleaving global memory instructions to emit the correct waitcnt during lowering.
258+
}];
259+
260+
let constructor = "mlir::createTritonAMDGPUUpdateAsyncWaitCountPass()";
261+
262+
let dependentDialects = [];
263+
264+
let options = [
265+
Option<"archGenerationName", "arch-generation-name",
266+
"std::string", /*default=*/"std::string{}",
267+
"GFX generation name of target device.">,
268+
];
269+
}
270+
251271
def TritonAMDFoldTrueCmpI: Pass<"tritonamdgpu-fold-true-cmpi", "mlir::ModuleOp"> {
252272
let summary = "Fold true arith.cmpi to %true";
253273

third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ add_triton_library(TritonAMDGPUTransforms
1111
MfmaGroup.cpp
1212
InThreadTranspose.cpp
1313
FoldTrueCmpIOp.cpp
14+
UpdateAsyncWaitCount.cpp
15+
Utility.cpp
1416

1517
DEPENDS
1618
TritonAMDGPUIR

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,13 @@ bool StreamPipeliner::createAsyncCopy(tt::LoadOp loadOp, Value alloc,
346346
builder.create<ttg::AsyncCommitGroupOp>(loc, newLoadOp->getResult(0));
347347
ttg::AsyncWaitOp wait =
348348
builder.create<ttg::AsyncWaitOp>(loc, commit->getResult(0), 0);
349-
350349
// We need to place the prefetches (AsyncCopy) after the AsyncWaits which
351350
// create a barrier to ensure all warps are finished reading the shared buffer
352351
// we will write into. This is done by scheduling it as a local_store.
353352
scheduleOp(newLoadOp, SCHED_LOCAL_STORE);
353+
// Place ttg.async_commit_group op next to async load so the later
354+
// UpdateAsyncWaitCount pass can deduce better waitcnts
355+
scheduleOp(commit, SCHED_LOCAL_STORE);
354356

355357
// Create local load which consumes the async token from the AsyncWait
356358
auto sharedLoad =
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#include "amd/lib/TritonAMDGPUToLLVM/Utility.h"
2+
#include "amd/lib/TritonAMDGPUTransforms/Utility.h"
3+
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
4+
#include "llvm/ADT/TypeSwitch.h"
5+
6+
#define GEN_PASS_CLASSES
7+
#include "TritonAMDGPUTransforms/Passes.h"
8+
9+
// This pass updates the waitCount of `AsyncWait` Ops to represent the number of
10+
// inflight async load operation between the async_wait and the definition of
11+
// the AsyncToken, thus allowing to wait only on the dependent async loads
12+
// allowing loads issued after to complete in the future.
13+
// This also means we should never overestimate the value to ensure
14+
// correctness; being conservative and underestimating is fine given that only
15+
// affects performance
16+
// For each async_wait we need to compute the minimum across all AsyncToken
17+
// operands.
18+
// For each token the minimum number of async transactions along its
19+
// def chain is deduced. A token can be copied when passing in as loop initial
20+
// argument and yielded from a loop body in which case we need to take the
21+
// minimum along both paths.
22+
// We do not exit early if we encounter another async_wait along the def chain
23+
// because the pipeliner will merge redundant waits for us already
24+
25+
using namespace mlir;
26+
namespace tt = triton;
27+
namespace ttg = triton::gpu;
28+
29+
// Returns the number of individual async load memory transactions when copy
30+
// data from the given |srcTy| in global memory to the given |dstTy| in shared
31+
// memory.
32+
int getNumberOfLoadInstructions(RankedTensorType srcTy,
33+
ttg::MemDescType dstTy) {
34+
auto shape = srcTy.getShape();
35+
LinearLayout srcLayout = tt::gpu::toLinearLayout(shape, srcTy.getEncoding());
36+
LinearLayout sharedLayout =
37+
tt::gpu::toLinearLayout(shape, dstTy.getEncoding());
38+
LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);
39+
40+
// On GFX9 we cannot split direct to lds loads into multiple ones because we
41+
// need coalesced writes. So we can divide the number of registers by the
42+
// contiguity to get the number of load instructions.
43+
int contig = srcToSharedLayout.getNumConsecutiveInOut();
44+
int numberOfRegisters = srcToSharedLayout.getInDimSize(
45+
StringAttr::get(srcTy.getContext(), "register"));
46+
int loadInstructionCount = std::max(1, numberOfRegisters / contig);
47+
return loadInstructionCount;
48+
}
49+
50+
// The pipeliner always insert ops following an order of ttg.async_load ->
51+
// [token] -> ttg.async_commit_group -> [token] -> ttg.async_wait. So here we
52+
// scan the operands of ttg.async_commit_group to count the number of issued
53+
// async load intrinsics.
54+
int getNumberOfLoadInstructions(Operation *op) {
55+
if (isa<ttg::AsyncCommitGroupOp>(op)) {
56+
int count = 0;
57+
for (auto token : op->getOperands()) {
58+
auto defOp = token.getDefiningOp();
59+
if (!defOp)
60+
continue;
61+
if (auto copyOp = llvm::dyn_cast<ttg::AsyncCopyGlobalToLocalOp>(defOp)) {
62+
count += getNumberOfLoadInstructions(copyOp.getSrc().getType(),
63+
copyOp.getResult().getType());
64+
} else if (auto copyOp =
65+
llvm::dyn_cast<amdgpu::BufferLoadToLocalOp>(defOp)) {
66+
auto srcTy = cast<RankedTensorType>(LLVM::AMD::getPointerTypeWithShape(
67+
copyOp.getPtr(), copyOp.getOffsets()));
68+
count += getNumberOfLoadInstructions(srcTy, copyOp.getDest().getType());
69+
}
70+
}
71+
return count;
72+
}
73+
if (isa<tt::LoadOp, tt::StoreOp, amdgpu::BufferLoadToLocalOp,
74+
amdgpu::BufferStoreOp, tt::AtomicRMWOp, tt::AtomicCASOp,
75+
amdgpu::BufferAtomicRMWOp>(op)) {
76+
op->emitRemark("Global memory operation between async wait and "
77+
"async_loads. This will hinder the interleaving of memory "
78+
"operations and might impact performance.");
79+
}
80+
return 0;
81+
}
82+
83+
// LLVM cannot infer the dependency between direct to lds (async) loads and
84+
// the local reads between warps in a workgroup. As a workaround we update the
85+
// waitcnt to represent the number of hardware instructions we are
86+
// interleaving with. This allows us to manually emit the waitcnt during
87+
// lowering.
88+
void updateWaitCount(ttg::AsyncWaitOp waitOp, RewriterBase &rewriter) {
89+
int waitCnt = std::numeric_limits<int>::max();
90+
91+
// AsyncWait can await multiple tokens so we get the minimum from all
92+
// tokens
93+
for (auto token : waitOp.getOperands()) {
94+
// Traverse def chain from waitOp to the producer of the token and count
95+
// the minumum number of vmcnt instructions
96+
auto tokenWaitCnt =
97+
deduceMinCountOnDefChain(token, waitOp, [](Operation *op) {
98+
return getNumberOfLoadInstructions(op);
99+
});
100+
waitCnt = std::min(waitCnt, tokenWaitCnt);
101+
}
102+
103+
if (waitCnt == std::numeric_limits<int>::max() || waitOp.getNum() == waitCnt)
104+
return;
105+
106+
rewriter.modifyOpInPlace(waitOp, [&]() { waitOp.setNum(waitCnt); });
107+
}
108+
109+
struct TritonAMDGPUUpdateAsyncWaitCountPass
110+
: public TritonAMDGPUUpdateAsyncWaitCountBase<
111+
TritonAMDGPUUpdateAsyncWaitCountPass> {
112+
TritonAMDGPUUpdateAsyncWaitCountPass(StringRef archGenName) {
113+
this->archGenerationName = archGenName.str();
114+
}
115+
116+
void runOnOperation() override {
117+
tt::AMD::TargetInfo targetInfo(archGenerationName);
118+
if (!targetInfo.isCDNA()) {
119+
return;
120+
}
121+
122+
ModuleOp m = getOperation();
123+
124+
SmallVector<ttg::AsyncWaitOp> waitOps;
125+
getOperation()->walk(
126+
[&](ttg::AsyncWaitOp waitOp) { waitOps.push_back(waitOp); });
127+
128+
for (auto waitOp : waitOps) {
129+
IRRewriter builder(waitOp->getContext());
130+
updateWaitCount(waitOp, builder);
131+
}
132+
}
133+
};
134+
135+
std::unique_ptr<Pass>
136+
mlir::createTritonAMDGPUUpdateAsyncWaitCountPass(std::string archGenName) {
137+
return std::make_unique<TritonAMDGPUUpdateAsyncWaitCountPass>(archGenName);
138+
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
#include "Utility.h"

#include "mlir/Dialect/SCF/IR/SCF.h"

#include <limits>

namespace deduceMin {
int deduceMinCountInBlock(Block &block,
                          const std::function<int(Operation *)> &countFunc);

// Accumulates countFunc(op) for the ops in [beginOp, endOp) and returns the
// minimum over all control-flow paths. Note that endOp itself is NOT counted.
int deduceMinCountBetweenOps(Operation *beginOp, Operation *endOp,
                             const std::function<int(Operation *)> &countFunc) {
  assert(beginOp && endOp);
  assert(beginOp == endOp || beginOp->isBeforeInBlock(endOp));
  int count = 0;
  for (auto op = beginOp; op != endOp; op = op->getNextNode()) {
    if (auto ifOp = llvm::dyn_cast<scf::IfOp>(op)) {
      assert(!ifOp.getThenRegion().empty() && !ifOp.getElseRegion().empty());
      auto minThen =
          deduceMinCountInBlock(ifOp.getThenRegion().front(), countFunc);
      auto minElse =
          deduceMinCountInBlock(ifOp.getElseRegion().front(), countFunc);
      // Either branch may execute, so only the smaller contribution is
      // guaranteed.
      count += std::min(minThen, minElse);
    } else if (auto forOp = llvm::dyn_cast<scf::ForOp>(op)) {
      // A loop only contributes when its trip count is a known constant;
      // otherwise conservatively assume zero iterations.
      auto tripCount = constantTripCount(forOp.getLowerBound(),
                                         forOp.getUpperBound(), forOp.getStep())
                           .value_or(0);
      if (tripCount > 0) {
        count += tripCount * deduceMinCountInBlock(*forOp.getBody(), countFunc);
      }
    } else {
      count += countFunc(op);
    }
  }
  return count;
}

// Returns the minimum found when accumulating countFunc(op) for all paths
// between the block's first op and its last op (the last op is excluded by
// deduceMinCountBetweenOps; for non-empty blocks that is the terminator).
int deduceMinCountInBlock(Block &block,
                          const std::function<int(Operation *)> &countFunc) {
  if (block.empty())
    return 0;
  return deduceMinCountBetweenOps(&block.front(), &block.back(), countFunc);
}
} // namespace deduceMin

// Recursive worker: accumulates counts along the def chain of defValue up to
// consumerOp. pathSum holds the count accumulated on the current path so far;
// foundMin is the smallest complete path seen so far.
int deduceMinCountOnDefChain(Value defValue, Operation *consumerOp,
                             const std::function<int(Operation *)> &countFunc,
                             int pathSum, int foundMin) {
  using namespace deduceMin;
  // If the value is not defined in the same region as the consumer we need to
  // peel the parent region of consumer until we arrive at value's region
  while (consumerOp->getParentRegion() != defValue.getParentRegion()) {
    pathSum += deduceMinCountBetweenOps(&consumerOp->getBlock()->front(),
                                        consumerOp, countFunc);
    consumerOp = consumerOp->getParentOp();
  }

  // Base case: we reached the producer. Close the path with the ops between
  // producer and consumer.
  if (Operation *defOp = defValue.getDefiningOp()) {
    pathSum +=
        deduceMinCountBetweenOps(defOp->getNextNode(), consumerOp, countFunc);
    foundMin = std::min(foundMin, pathSum);
    return foundMin;
  }
  // If value is a loop carried argument (BlockArgument) we need to look at
  // initial arguments of the loop and the previous iteration
  if (auto arg = mlir::dyn_cast<BlockArgument>(defValue)) {
    Block *block = arg.getOwner();
    auto forOp = dyn_cast<scf::ForOp>(block->getParentOp());

    // Failed to track, return 0 conservatively.
    if (!forOp || forOp.getBody()->empty()) {
      return 0;
    }
    // The induction variable (block argument 0) cannot carry an async token;
    // bail out conservatively instead of indexing getInitArgs()/yield
    // operands with argNumber - 1 == -1 below.
    if (arg.getArgNumber() == 0) {
      return 0;
    }

    Operation *firstOpInLoop = &*forOp.getBody()->begin();
    pathSum += deduceMinCountBetweenOps(firstOpInLoop, consumerOp, countFunc);

    // Prune: this path already exceeds the best known minimum.
    if (pathSum >= foundMin)
      return foundMin;

    // Path 1: the value flowing in from outside the loop (init args).
    Value incomingVal = forOp.getInitArgs()[arg.getArgNumber() - 1];
    int countLoopInit = deduceMinCountOnDefChain(incomingVal, forOp, countFunc,
                                                 pathSum, foundMin);

    // Path 2: the value yielded by the previous loop iteration.
    Operation *yieldOp = block->getTerminator();
    Value prevVal = yieldOp->getOperand(arg.getArgNumber() - 1);
    int countPreviousIter = deduceMinCountOnDefChain(
        prevVal, yieldOp, countFunc, pathSum, foundMin);

    return std::min(std::min(countLoopInit, countPreviousIter), foundMin);
  }

  // Unsupported value, return 0 conservatively.
  return 0;
}

int deduceMinCountOnDefChain(Value defValue, Operation *consumerOp,
                             llvm::function_ref<int(Operation *)> countFunc) {
  return deduceMinCountOnDefChain(defValue, consumerOp, countFunc, 0,
                                  std::numeric_limits<int>::max());
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#ifndef TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTRANSFORMS_UTILITY_H_
#define TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTRANSFORMS_UTILITY_H_

#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"

// NOTE(review): a using-directive in a header leaks into every includer;
// consider qualifying with mlir:: instead — confirm no includer relies on it.
using namespace mlir;

// DFS over the def chain of 'defValue' starting from 'consumerOp'. Returns
// the minimum found when accumulating countFunc(op) for all non-control-flow
// ops between the value's producer and the consumer. Traverses through for
// loops: both the init-args path into the loop and the loop-carried
// (previous-iteration) path are explored, taking the minimum of the two.
// countFunc(Operation*) returns the value to accumulate for an operation.
// Returns 0 conservatively if the def chain cannot be traversed.
int deduceMinCountOnDefChain(Value defValue, Operation *consumerOp,
                             llvm::function_ref<int(Operation *)> countFunc);

#endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTRANSFORMS_UTILITY_H_

0 commit comments

Comments
 (0)