[AMD] Refactor Membar filter for LocalLoads synced via AsyncWait (#7047)

AlexAUT · web-flow · commit 859dcf0c5b3b · 2025-06-04T08:56:02.000-07:00
Instead of walking the def-chain of the `AsyncToken` inside the membar
filter, we now do it once before running the membar analysis. This also
makes the branch handling more generic by using `BranchOpInterface`
instead of the specific branch instructions.

This also allows us to reuse the information when adding alias
information while lowering `LocalLoads`, which will be enabled in a
follow-up PR.
diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/MembarUtility.h b/third_party/amd/include/TritonAMDGPUToLLVM/MembarUtility.h
@@ -1,9 +1,19 @@
 #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_MEMBARUTILITY_H_
 #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_MEMBARUTILITY_H_
 
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Operation.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
+
+// Annotates every LocalLoadOp with a ttg.amdgpu.syncedViaAsyncWait BoolAttr
+// that is true if the load is synced by an AsyncWait and false otherwise.
+void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod);
+
+// Getter for the annotation applied by annotateLocalLoadsSyncedViaAsyncWait
+bool isSyncedViaAsyncWait(triton::gpu::LocalLoadOp localLoadOp);
+
 // Filter function used in the AMDGPU backend to filter unnecessary barriers
 // during Membar Analysis. Filters applied by this function:
 // 1) Do not create barriers between AsyncCopyGlobalToLocal and LocalLoad if the
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MembarUtility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MembarUtility.cpp
@@ -1,10 +1,11 @@
-#include "third_party/amd/include/TritonAMDGPUToLLVM/MembarUtility.h"
+#include "TritonAMDGPUToLLVM/MembarUtility.h"
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
-#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
 namespace {
+constexpr const char *syncedViaAsyncWaitAttrName =
+    "ttg.amdgpu.syncedViaAsyncWait";
 
 // Traverses the def-chain including control flow of the token and returns true
 // if all defining operations are an AsyncWait
@@ -31,16 +32,12 @@ bool comesFromAsyncWait(Value token) {
   // argId to see if they are immediately an AsyncWait.
   for (auto *pred : block->getPredecessors()) {
     auto terminator = pred->getTerminator();
-    if (auto br = dyn_cast<cf::BranchOp>(terminator)) {
-      if (!destOperandFromAsyncWait(br.getDestOperands()))
-        return false;
-    } else if (auto condBr = dyn_cast<cf::CondBranchOp>(terminator)) {
-      if (condBr.getTrueDest() == block) {
-        if (!destOperandFromAsyncWait(condBr.getTrueDestOperands()))
-          return false;
-      }
-      if (condBr.getFalseDest() == block) {
-        if (!destOperandFromAsyncWait(condBr.getFalseDestOperands()))
+    if (auto br = dyn_cast<BranchOpInterface>(terminator)) {
+      for (auto successor : llvm::enumerate(br->getSuccessors())) {
+        if (block != successor.value())
+          continue;
+        auto operands = br.getSuccessorOperands(successor.index());
+        if (!destOperandFromAsyncWait(operands))
           return false;
       }
     } else {
@@ -51,19 +48,14 @@ bool comesFromAsyncWait(Value token) {
 }
 
 // Returns true if one of the operands is a LocalLoad synced via AsyncWait.
-bool filterAsyncLocalLoadsDeppendencies(Operation *op1, Operation *op2) {
+bool filterAsyncLocalLoadsDependencies(Operation *op1, Operation *op2) {
   auto isAsyncLoad = [](Operation *op) {
     return llvm::isa<triton::gpu::AsyncCopyGlobalToLocalOp,
                      triton::amdgpu::BufferLoadToLocalOp>(op);
   };
   auto isLocalLoadWithAsyncWaitToken = [](Operation *op) {
     auto localLoad = llvm::dyn_cast<triton::gpu::LocalLoadOp>(op);
-    if (!localLoad)
-      return false;
-    Value token = localLoad.getToken();
-    if (!token || !comesFromAsyncWait(token))
-      return false;
-    return true;
+    return localLoad && isSyncedViaAsyncWait(localLoad);
   };
 
   // Early return if neither or both operands are an AsyncLoad
@@ -76,7 +68,33 @@ bool filterAsyncLocalLoadsDeppendencies(Operation *op1, Operation *op2) {
 };
 } // namespace
 
+void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod) {
+  // Collect all LocalLoadOps up front, then annotate them in a second loop.
+  SmallVector<triton::gpu::LocalLoadOp> loads;
+  mod->walk([&](triton::gpu::LocalLoadOp op) { loads.emplace_back(op); });
+
+  auto *context = mod->getContext();
+  for (triton::gpu::LocalLoadOp load : loads) {
+    // A load without a token is recorded as not synced (attribute is false).
+    bool synced = false;
+    if (auto token = load.getToken())
+      synced = comesFromAsyncWait(token);
+    load->setAttr(syncedViaAsyncWaitAttrName, BoolAttr::get(context, synced));
+  }
+}
+
+bool isSyncedViaAsyncWait(triton::gpu::LocalLoadOp localLoadOp) {
+  auto attr = localLoadOp->getAttr(syncedViaAsyncWaitAttrName);
+  if (!attr) { // Missing annotation: emit a remark and treat as unsynced.
+    localLoadOp.emitRemark("has no async sync information attached to it which "
+                           "might negatively affect performance. Run "
+                           "annotateLocalLoadsSyncedViaAsyncWait first");
+    return false;
+  }
+  return cast<BoolAttr>(attr).getValue(); // Attribute is a BoolAttr.
+}
+
 bool membarFilter(Operation *op1, Operation *op2) {
-  return filterAsyncLocalLoadsDeppendencies(op1, op2);
+  return filterAsyncLocalLoadsDependencies(op1, op2);
 }
 } // namespace mlir::triton::AMD
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp
@@ -100,6 +100,7 @@ struct ConvertTritonAMDGPUToLLVM
     // Allocate shared memory and set barrier
     ModuleAllocation allocation(mod);
 
+    AMD::annotateLocalLoadsSyncedViaAsyncWait(mod);
     ModuleMembarAnalysis membarPass(&allocation,
                                     mlir::triton::AMD::membarFilter);
     membarPass.run();
diff --git a/third_party/amd/test/lib/Analysis/TestAMDGPUMembar.cpp b/third_party/amd/test/lib/Analysis/TestAMDGPUMembar.cpp
@@ -21,10 +21,10 @@ struct TestAMDGPUMembarPass
 
   void runOnOperation() override {
     ModuleOp moduleOp = getOperation();
+    triton::AMD::annotateLocalLoadsSyncedViaAsyncWait(moduleOp);
     // Print all ops after membar pass
     ModuleAllocation allocation(moduleOp);
-    ModuleMembarAnalysis membarPass(&allocation,
-                                    mlir::triton::AMD::membarFilter);
+    ModuleMembarAnalysis membarPass(&allocation, triton::AMD::membarFilter);
     membarPass.run();
   }
 };