[AMD] Enable async pingpong for F16 GEMMs (#796)

raikonenfnu · AlexAUT · web-flow · commit 1028c8f60f81 · 2025-05-15T08:24:01.000-07:00
* [AMD] Generalize PingPong to support different types of Load/Store Ops

The main motivation behind this commit is to add support for PingPong
with AsyncOps. To accomplish that, we made these changes:
- Fork "determineDotMemoryOps" to "determineDotAsyncMemoryOps" that handles async memory ops.
- Refactor validation and pruning of memory ops into "pruneDotMemoryOps"
  so that we have a clean interface for its async memory ops counterpart,
  "pruneAsyncDotMemoryOps".
- Plumb "useBlockPingpong" into StreamPipeliner so that it can adjust the
  AsyncWait stage/cluster to hoist the first AsyncWait and allow setting
  the AsyncWait towards the end of the loop, making it easier for the 4 PP
  cluster to move it before the 3rd dot-slice / 2 s_barrier before
  localLoads. This is to ensure there are no race conditions.
- Add a check to enable handling of dotSOps (dot scaled) vs. dotOps (dot).

Signed-off-by: Stanley Winata <stanley.winata@amd.com>
Co-authored-by: Alexander Weinrauch <alexander.weinrauch@amd.com>
diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
@@ -35,6 +35,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_HIP_LOCAL_PREFETCH",
     "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE",
+    "TRITON_HIP_ENABLE_F16_ASYNC_PINGPONG",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
     "TRITON_LLVM_DEBUG_ONLY",
diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
@@ -262,7 +262,7 @@ def make_ttgir(mod, metadata, options):
         amd.passes.ttgpuir.add_reorder_instructions(pm)
         use_block_pingpong = is_pingpong_schedule_enabled(options.arch)
         if use_block_pingpong and options.num_stages in [2, 4]:
-            amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)
+            amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages, use_async_copy)
 
         if knobs.amd.use_buffer_ops:
             amd.passes.ttgpuir.add_canonicalize_pointers(pm)
diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/AsyncUtility.h b/third_party/amd/include/TritonAMDGPUToLLVM/AsyncUtility.h
@@ -0,0 +1,12 @@
+#ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_ASYNCUTILITY_H_
+#define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_ASYNCUTILITY_H_
+
+#include "mlir/IR/Value.h"
+
+namespace mlir::triton::AMD {
+// Traverses the def-chain of the token, including control flow, and returns
+// true if all defining operations are AsyncWait ops.
+bool comesFromAsyncWait(mlir::Value value);
+} // namespace mlir::triton::AMD
+
+#endif
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
@@ -34,7 +34,8 @@ std::unique_ptr<Pass> createTritonAMDGPUConvertToBufferOpsPass(
     std::string archGenName = std::string());
 
 std::unique_ptr<Pass>
-createTritonAMDGPUBlockPingpongPass(int32_t numStages = 2);
+createTritonAMDGPUBlockPingpongPass(int32_t numStages = 2,
+                                    bool useAsyncCopy = false);
 
 std::unique_ptr<Pass> createTritonAMDGPUInThreadTransposePass();
 
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -168,11 +168,12 @@ def TritonAMDGPUBlockPingpong: Pass<"tritonamdgpu-block-pingpong", "mlir::Module
 
   let dependentDialects = ["mlir::ROCDL::ROCDLDialect, mlir::triton::amdgpu::TritonAMDGPUDialect"];
 
-  let options = [
-    Option<"numStages", "num-stages",
-        "int32_t", /*default*/"2",
-        "Number of Pipeline stages">,
-    ];
+  let options =
+      [Option<"numStages", "num-stages", "int32_t", /*default*/ "2",
+              "Number of Pipeline stages">,
+       Option<"useAsyncCopy", "use_async_copy", "bool", /*default*/ "false",
+              "Use AsyncCopyGlobalToLocal to directly load to shared memory">,
+  ];
 }
 
 def TritonAMDGPUInThreadTranspose: Pass<"tritonamdgpu-in-thread-transpose", "mlir::triton::FuncOp"> {
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.cpp
@@ -0,0 +1,62 @@
+#include "third_party/amd/include/TritonAMDGPUToLLVM/AsyncUtility.h"
+#include "Dialect/TritonAMDGPU/IR/Dialect.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/IR/Operation.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
+namespace mlir::triton::AMD {
+
+// Traverses the def-chain of the token, looking through conversion casts and
+// branch operands, and returns true if every defining op is an AsyncWait.
+bool comesFromAsyncWait(mlir::Value token) {
+  if (auto defOp = token.getDefiningOp()) {
+    if (isa<triton::gpu::AsyncWaitOp>(defOp))
+      return true;
+    else if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(defOp))
+      return comesFromAsyncWait(castOp.getInputs()[0]); // first input only
+    else
+      return false;
+  }
+
+  auto blockArg = llvm::dyn_cast<mlir::BlockArgument>(token);
+  // If the token has no defining op and is not a BlockArgument, bail out
+  if (!blockArg) {
+    return false;
+  }
+
+  auto block = blockArg.getOwner();
+  auto argId = blockArg.getArgNumber();
+
+  auto destOperandFromAsyncWait = [argId](auto &&operands) {
+    assert(argId < operands.size());
+    return comesFromAsyncWait(operands[argId]);
+  };
+
+  // Check every predecessor block's terminator and follow the value passed at
+  // argId; all of them must (recursively) come from an AsyncWait.
+  for (auto *pred : block->getPredecessors()) {
+    auto terminator = pred->getTerminator();
+    if (auto br = llvm::dyn_cast<cf::BranchOp>(terminator)) {
+      if (!destOperandFromAsyncWait(br.getDestOperands()))
+        return false;
+    } else if (auto condBr = llvm::dyn_cast<cf::CondBranchOp>(terminator)) {
+      if (condBr.getTrueDest() == block) {
+        if (!destOperandFromAsyncWait(condBr.getTrueDestOperands()))
+          return false;
+      }
+      if (condBr.getFalseDest() == block) {
+        if (!destOperandFromAsyncWait(condBr.getFalseDestOperands()))
+          return false;
+      }
+    } else if (auto br = llvm::dyn_cast<LLVM::BrOp>(terminator)) {
+      if (!destOperandFromAsyncWait(br.getDestOperands()))
+        return false;
+    } else { // NOTE(review): reached for any other terminator kind
+      llvm::dbgs() << "no terminator!" << *terminator << "\n";
+      return false;
+    }
+  }
+  return true; // NOTE(review): also reached when block has no predecessors
+}
+
+} // namespace mlir::triton::AMD
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt b/third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_triton_library(TritonAMDGPUToLLVM
+    AsyncUtility.cpp
     AtomicRMWOpsEmitter.cpp
     BufferOpsEmitter.cpp
     ConvertLayoutOpToLLVM/SharedToDotOperandHelper.cpp
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MembarUtility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MembarUtility.cpp
@@ -1,55 +1,12 @@
 #include "third_party/amd/include/TritonAMDGPUToLLVM/MembarUtility.h"
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "third_party/amd/include/TritonAMDGPUToLLVM/AsyncUtility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
 namespace {
 
-// Traverses the def-chain including control flow of the token and returns true
-// if all defining operations are an AsyncWait
-bool comesFromAsyncWait(Value token) {
-  if (auto defOp = token.getDefiningOp()) {
-    return isa<triton::gpu::AsyncWaitOp>(defOp);
-  }
-
-  auto blockArg = dyn_cast<BlockArgument>(token);
-  // If the token has no defining op and is not an BlockArgument bail out
-  if (!blockArg) {
-    return false;
-  }
-
-  auto block = blockArg.getOwner();
-  auto argId = blockArg.getArgNumber();
-
-  auto destOperandFromAsyncWait = [argId](auto &&operands) {
-    assert(argId < operands.size());
-    return comesFromAsyncWait(operands[argId]);
-  };
-
-  // Check all predecessor block's terminator and follow the passed value at
-  // argId to see if they are immediately an AsyncWait.
-  for (auto *pred : block->getPredecessors()) {
-    auto terminator = pred->getTerminator();
-    if (auto br = dyn_cast<cf::BranchOp>(terminator)) {
-      if (!destOperandFromAsyncWait(br.getDestOperands()))
-        return false;
-    } else if (auto condBr = dyn_cast<cf::CondBranchOp>(terminator)) {
-      if (condBr.getTrueDest() == block) {
-        if (!destOperandFromAsyncWait(condBr.getTrueDestOperands()))
-          return false;
-      }
-      if (condBr.getFalseDest() == block) {
-        if (!destOperandFromAsyncWait(condBr.getFalseDestOperands()))
-          return false;
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Returns true if one of the operands is a LocalLoad synced via AsyncWait.
 bool filterAsyncLocalLoadsDeppendencies(Operation *op1, Operation *op2) {
   auto isAsyncLoad = [](Operation *op) {
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -4,12 +4,14 @@
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/IR/PatternMatch.h"
+#include "third_party/amd/include/TritonAMDGPUToLLVM/AsyncUtility.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 
 namespace tt = mlir::triton;
 using mlir::triton::ModuleAxisInfoAnalysis;
+using mlir::triton::AMD::comesFromAsyncWait;
 using mlir::triton::AMD::DppCtrl;
 using mlir::triton::AMD::ISAFamily;
 using mlir::triton::gpu::appendOrGetExternFuncOp;
@@ -734,8 +736,9 @@ void addAsyncCopyAliasScope(AliasAnalysisOpInterface directToLdsOp) {
 void addLocalLoadNoAliasScope(triton::gpu::LocalLoadOp localLoadOp,
                               AliasAnalysisOpInterface llLoadOp) {
   auto token = localLoadOp.getToken();
-  if (!token || !token.getDefiningOp<tt::gpu::AsyncWaitOp>())
+  if (!token || !comesFromAsyncWait(token)) {
     return;
+  }
 
   return addLocalLoadNoAliasScope(llLoadOp);
 }
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`add_triton_library(TritonAMDGPUToLLVM`
	`2`	`+ AsyncUtility.cpp`
`2`	`3`	`AtomicRMWOpsEmitter.cpp`
`3`	`4`	`BufferOpsEmitter.cpp`
`4`	`5`	`ConvertLayoutOpToLLVM/SharedToDotOperandHelper.cpp`