[AMD] Add amdgpu.async_wait to explicitly represent number of async transactions (#8575)
`ttg.async_wait` counts the number of outstanding `ttg.async_commit_group`s. However, when lowering to LLVM on AMD we need the number of outstanding async intrinsics/final assembly instructions. This conversion is already done by `UpdateAsyncWaitCount`, which modifies the `num` of `ttg.async_wait` in place. This PR introduces a new op, `amdgpu.async_wait`, to make the change in semantics explicit in the IR. `UpdateAsyncWaitCount` is moved to the TTGIR->LLVM pipeline, primarily so it also covers `Gluon` kernels; we always run it, since it only has an effect if `ttg.async_wait` ops are present in the kernel. To avoid changes to Membar, this PR also adds a `ttg.local_barrier` after each `amdgpu.async_wait`. Membar respects the newly added barrier and behaves the same as it does for `ttg.async_wait`.
1 parent 33f077b commit a295e60

11 files changed: +98 −43 lines
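As a minimal sketch of the semantic shift described above (hand-written for illustration; the tensor types and the `num_inst` value are assumptions, since the number of instructions per copy depends on the shapes and the target):

// Before: `num` counts outstanding ttg.async_commit_groups.
%t0 = ttg.async_copy_global_to_local %ptrs, %buf : tensor<64x64x!tt.ptr<f16>, #blocked> -> <64x64xf16, #shared, #smem, mutable>
%t1 = ttg.async_commit_group tokens %t0
%t2 = ttg.async_wait %t1 {num = 1 : i32}

// After UpdateAsyncWaitCount and barrier insertion: `num_inst` counts
// outstanding async instructions (2 is an assumed expansion factor for the
// copy), and the explicit barrier keeps Membar's behaviour unchanged.
%t2 = amdgpu.async_wait %t1 {num_inst = 2 : i32}
ttg.local_barrier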

test/Conversion/amd/async-ops-alias-scopes.mlir

Lines changed: 4 additions & 4 deletions

@@ -65,7 +65,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 tt.func public @local_loads_with_token_from_async_wait(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
 %arg1: !ttg.memdesc<64x1xf16, #shared, #smem, mutable>,
 %arg2: !ttg.memdesc<16x16xf16, #shared, #smem, mutable>) {
-%3 = ttg.async_wait {num = 1 : i32}
+%3 = amdgpu.async_wait {num_inst = 1 : i32}

 // Check alias information is added for different lowering paths

@@ -111,7 +111,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 %0 = ttg.async_copy_global_to_local %ptr, %arg1 : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>
 %1 = ttg.async_commit_group tokens %0

-%3 = ttg.async_wait %1 {num = 1 : i32}
+%3 = amdgpu.async_wait %1 {num_inst = 1 : i32}

 // Check alias information is not used at all for different lowering paths
 // COMMON-NOT: [[$ASYNC_COPY_SCOPE]]

@@ -146,14 +146,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 %c0_i32 = arith.constant 0 : i32
 %c1_i32 = arith.constant 1 : i32

-%1 = ttg.async_wait {num = 1 : i32}
+%1 = amdgpu.async_wait {num_inst = 1 : i32}
 // COMMON: llvm.load
 %2 = ttg.local_load %arg1 token %1 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>

 %loop_result:2 = scf.for %arg14 = %c0_i32 to %loopIterCount step %c1_i32 iter_args(%arg10 = %1, %arg11 = %2) -> (!ttg.async.token, tensor<64x1xf16, #blocked>) : i32 {
 // COMMON: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
 %3 = ttg.local_load %arg1 token %arg10 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>
-%4 = ttg.async_wait {num = 1 : i32}
+%4 = amdgpu.async_wait {num_inst = 1 : i32}
 scf.yield %4, %3: !ttg.async.token, tensor<64x1xf16, #blocked>
 }
test/Conversion/amd/async_ops_to_llvm.mlir

Lines changed: 5 additions & 5 deletions

@@ -106,24 +106,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 // CHECK: rocdl.s.waitcnt -49168
 // CHECK: rocdl.s.waitcnt -7937
 // CHECK: rocdl.s.barrier
-ttg.async_wait {num = 0 : i32}
+amdgpu.async_wait {num_inst = 0 : i32}
 // CHECK: rocdl.s.waitcnt -49167
 // CHECK: rocdl.s.waitcnt -7937
 // CHECK: rocdl.s.barrier
-ttg.async_wait {num = 1 : i32}
+amdgpu.async_wait {num_inst = 1 : i32}
 // CHECK: rocdl.s.waitcnt -2
 // CHECK: rocdl.s.waitcnt -7937
 // CHECK: rocdl.s.barrier
-ttg.async_wait {num = 62 : i32}
+amdgpu.async_wait {num_inst = 62 : i32}
 // CHECK: rocdl.s.waitcnt -1
 // CHECK: rocdl.s.waitcnt -7937
 // CHECK: rocdl.s.barrier
-ttg.async_wait {num = 63 : i32}
+amdgpu.async_wait {num_inst = 63 : i32}
 // Check that we clamp values > 63
 // CHECK: rocdl.s.waitcnt -1
 // CHECK: rocdl.s.waitcnt -7937
 // CHECK: rocdl.s.barrier
-ttg.async_wait {num = 64 : i32}
+amdgpu.async_wait {num_inst = 64 : i32}
 tt.return
 }
 }

test/TritonGPU/amd/amd-update-async-wait-count.mlir

Lines changed: 23 additions & 23 deletions

@@ -18,10 +18,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %3 = ttg.async_commit_group tokens %2

 // Do not wait on the second async_copy => waitcnt 2
-// CHECK: ttg.async_wait {{.*}} {num = 2
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 2
 %9 = ttg.async_wait %1 {num = 0 : i32}
 // No async_copies in between => waitcnt 0
-// CHECK: ttg.async_wait {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
 %10 = ttg.async_wait %3 {num = 0 : i32}
 tt.return
 }

@@ -47,10 +47,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %3 = ttg.async_commit_group tokens %2

 // Do not wait on the second async_copy => waitcnt 2
-// CHECK: ttg.async_wait {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
 %9 = ttg.async_wait %3 {num = 0 : i32}
 // No async_copies in between => waitcnt 0
-// CHECK: ttg.async_wait {{.*}} {num = 2
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 2
 %10 = ttg.async_wait %1 {num = 0 : i32}
 tt.return
 }

@@ -77,9 +77,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ

 %4 = tt.load %arg3 : tensor<128x16x!tt.ptr<f16>, #blocked>

-// CHECK: ttg.async_wait {{.*}} {num = 2
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 2
 %9 = ttg.async_wait %1 {num = 0 : i32}
-// CHECK: ttg.async_wait {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
 %10 = ttg.async_wait %3 {num = 0 : i32}
 tt.return
 }

@@ -106,15 +106,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %2 = ttg.async_copy_global_to_local %arg4, %arg2 : tensor<16x256x!tt.ptr<f16>, #blocked1> -> <16x256xf16, #shared1, #smem, mutable>
 %3 = ttg.async_commit_group tokens %2
 %8:2 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg15 = %1, %arg16 = %3) -> (!ttg.async.token, !ttg.async.token) : i32 {
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 0
 %10 = ttg.async_wait %arg15, %arg16 {num = 2 : i32}
 %11 = ttg.async_copy_global_to_local %arg3, %arg1 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
 %12 = ttg.async_commit_group tokens %11
 %13 = ttg.async_copy_global_to_local %arg4, %arg2 : tensor<16x256x!tt.ptr<f16>, #blocked1> -> <16x256xf16, #shared1, #smem, mutable>
 %14 = ttg.async_commit_group tokens %13
 scf.yield %12, %14: !ttg.async.token, !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 0
 %9 = ttg.async_wait %8#0, %8#1 {num = 0 : i32}
 tt.return
 }

@@ -145,15 +145,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %6 = ttg.async_copy_global_to_local %arg4, %arg2 : tensor<16x256x!tt.ptr<f16>, #blocked1> -> <16x256xf16, #shared1, #smem, mutable>
 %7 = ttg.async_commit_group tokens %6
 %8:4 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg15 = %1, %arg16 = %5, %arg17 = %3, %arg18 = %7) -> (!ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 {
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 3
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 3
 %10 = ttg.async_wait %arg15, %arg17 {num = 2 : i32}
 %11 = ttg.async_copy_global_to_local %arg3, %arg1 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
 %12 = ttg.async_commit_group tokens %11
 %13 = ttg.async_copy_global_to_local %arg4, %arg2 : tensor<16x256x!tt.ptr<f16>, #blocked1> -> <16x256xf16, #shared1, #smem, mutable>
 %14 = ttg.async_commit_group tokens %13
 scf.yield %arg16, %12, %arg18, %14 : !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 0
 %9 = ttg.async_wait %8#0, %8#1, %8#2, %8#3 {num = 0 : i32}
 tt.return
 }

@@ -185,12 +185,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %8:4 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg15 = %1, %arg16 = %5, %arg17 = %3, %arg18 = %7) -> (!ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 {
 %103 = scf.if %cond -> (!ttg.async.token) {
 // We wait on both tokens so we interleave with one iteration => 3
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 3
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 3
 %token1 = ttg.async_wait %arg15, %arg17 {num = 2 : i32}
 scf.yield %token1 : !ttg.async.token
 } else {
 // We only wait on the token of the first load so we can interleave one more load => 3 + 2
-// CHECK: ttg.async_wait {{.*}} {num = 5
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 5
 %token2 = ttg.async_wait %arg15 {num = 1 : i32}
 scf.yield %token2 : !ttg.async.token
 }

@@ -200,7 +200,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %14 = ttg.async_commit_group tokens %13
 scf.yield %arg16, %12, %arg18, %14 : !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 0
 %9 = ttg.async_wait %8#0, %8#1, %8#2, %8#3 {num = 0 : i32}
 tt.return
 }

@@ -235,7 +235,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %cond_load = ttg.async_copy_global_to_local %arg4, %arg2 : tensor<16x256x!tt.ptr<f16>, #blocked1> -> <16x256xf16, #shared1, #smem, mutable>
 %cond_load_commit = ttg.async_commit_group tokens %cond_load
 // We wait on both tokens (3) and additionally we should count the load inside our block (+2) => 5
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 5
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 5
 %token1 = ttg.async_wait %arg15, %arg17 {num = 2 : i32}
 scf.yield %token1 : !ttg.async.token
 } else {

@@ -247,7 +247,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %14 = ttg.async_commit_group tokens %13
 scf.yield %arg16, %12, %arg18, %14 : !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 0
 %9 = ttg.async_wait %8#0, %8#1, %8#2, %8#3 {num = 0 : i32}
 tt.return
 }

@@ -279,7 +279,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %7 = ttg.async_commit_group tokens %6
 %8:4 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg15 = %1, %arg16 = %5, %arg17 = %3, %arg18 = %7) -> (!ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 {
 // The then block contains 3 instructions and the else 1 so we expect the count to be 3 (1 + 2) because there are also 2 instructions outside the scf.if in the loop body
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 3
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 3
 %token1 = ttg.async_wait %arg15, %arg17 {num = 2 : i32}

 %103 = scf.if %cond -> (!ttg.async.token) {

@@ -296,7 +296,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %14 = ttg.async_commit_group tokens %13
 scf.yield %arg16, %103, %arg18, %14 : !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}}, {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}}, {{.*}} {num_inst = 0
 %9 = ttg.async_wait %8#0, %8#1, %8#2, %8#3 {num = 0 : i32}
 tt.return
 }

@@ -323,14 +323,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %7 = ttg.async_commit_group tokens %6
 // Dynamic iteration count so we should not count its body
 %30 = scf.for %arg21 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg30 = %6) -> (!ttg.async.token) : i32 {
-// CHECK: ttg.async_wait {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
 %31 = ttg.async_wait %arg30 {num = 1 : i32}
 // Emits 1 direct to lds instruction
 %32 = ttg.async_copy_global_to_local %arg3, %arg1 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
 %33 = ttg.async_commit_group tokens %32
 scf.yield %33 : !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}} {num = 1
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 1
 %10 = ttg.async_wait %1 {num = 1 : i32}
 tt.return
 }

@@ -357,14 +357,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 %7 = ttg.async_commit_group tokens %6
 // Loop with 4 iterations => 4 instructions
 %30 = scf.for %arg21 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg30 = %6) -> (!ttg.async.token) : i32 {
-// CHECK: ttg.async_wait {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
 %31 = ttg.async_wait %arg30 {num = 1 : i32}
 // Emits 1 direct to lds instruction
 %32 = ttg.async_copy_global_to_local %arg3, %arg1 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
 %33 = ttg.async_commit_group tokens %32
 scf.yield %33 : !ttg.async.token
 }
-// CHECK: ttg.async_wait {{.*}} {num = 5
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 5
 %10 = ttg.async_wait %1 {num = 1 : i32}
 tt.return
 }

@@ -397,10 +397,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ

 // Check that we do not take other TDM loads into account (they use a different HW counter)

-// CHECK: ttg.async_wait {{.*}} {num = 2
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 2
 %cw1 = ttg.async_wait %21 {num = 0 : i32}

-// CHECK: ttg.async_wait {{.*}} {num = 0
+// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
 %cw2 = ttg.async_wait %51 {num = 0 : i32}
 tt.return
 }
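Distilled from the CHECK lines above, the rewrite rule works roughly like this (a sketch; the per-copy instruction counts are illustrative, the pass derives them from the copy's shape and the target architecture):

%a = ttg.async_copy_global_to_local %p0, %b0 ... // assume: lowers to 2 instructions
%ca = ttg.async_commit_group tokens %a
%b = ttg.async_copy_global_to_local %p1, %b1 ... // assume: lowers to 2 instructions
%cb = ttg.async_commit_group tokens %b
// Waiting only on %ca leaves %b's 2 instructions legitimately in flight, so
%w = ttg.async_wait %ca {num = 0 : i32}
// is rewritten to
%w = amdgpu.async_wait %ca {num_inst = 2 : i32}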

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 2 deletions

@@ -256,8 +256,6 @@ def make_ttgir(mod, metadata, options):
     passes.common.add_canonicalizer(pm)
     passes.common.add_cse(pm)
     passes.common.add_symbol_dce(pm)
-    if use_async_copy:
-        amd.passes.ttgpuir.add_update_async_wait_count(pm, options.arch)
     pm.run(mod, 'make_ttgir')
     return mod

@@ -283,6 +281,7 @@ def make_llir(src, metadata, options):
     # TritonGPU -> LLVM-IR (MLIR)
     pm = ir.pass_manager(mod.context)
     pm.enable_debug()
+    amd.passes.ttgpuir.add_update_async_wait_count(pm, options.arch)
     # custom_lds_size is an experimental parameter that defines amount of LDS available
     # for one thread block. Measured in bytes.
     #

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 17 additions & 0 deletions

@@ -775,4 +775,21 @@ def AsyncTDMWait : TT_AMDGPU_Op<"async_tdm_wait"> {
   let assemblyFormat = "$asyncToken attr-dict";
 }

+//===----------------------------------------------------------------------===//
+// AsyncWait
+//===----------------------------------------------------------------------===//
+
+def AsyncWaitOp : TT_AMDGPU_Op<"async_wait"> {
+  let summary = "Wait until there are less than or equal to the given number of outstanding async intrinsics";
+  let description = [{
+    Similar to ttg.async_wait, but instead of waiting on outstanding ttg.async_commit_groups
+    this op waits on the number of outstanding async instructions/intrinsics, as required for the
+    lowering to LLVM on the AMD backend.
+  }];
+
+  let arguments = (ins Variadic<TTG_AsyncToken>:$asyncToken, I32Attr:$num_inst);
+  let results = (outs TTG_AsyncToken:$retToken);
+  let assemblyFormat = "($asyncToken^)? attr-dict";
+}
+
 #endif
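Per the assembly format above, the token list is optional and the count lives in the attribute dictionary; both forms appear in the updated tests (%tok0 and %tok1 are placeholder tokens):

%w0 = amdgpu.async_wait {num_inst = 0 : i32}
%w1 = amdgpu.async_wait %tok0, %tok1 {num_inst = 3 : i32}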

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 1 addition & 1 deletion

@@ -257,7 +257,7 @@ def TritonAMDGPUUpdateAsyncWaitCount: Pass<"tritonamdgpu-update-async-wait-count
   compute the number of interleaving global memory instructions to emit the correct waitcnt during lowering.
   }];

-  let dependentDialects = [];
+  let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"];

   let options = [
     Option<"archGenerationName", "arch-generation-name",

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.cpp

Lines changed: 18 additions & 1 deletion

@@ -2,6 +2,7 @@

 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "TargetInfo.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "llvm/ADT/TypeSwitch.h"

@@ -13,7 +14,7 @@ constexpr const char *syncedViaAsyncWaitAttrName =
 // if all defining operations are an AsyncWait
 bool comesFromAsyncWait(Value token) {
   if (auto defOp = token.getDefiningOp()) {
-    return isa<triton::gpu::AsyncWaitOp>(defOp);
+    return isa<triton::gpu::AsyncWaitOp, amdgpu::AsyncWaitOp>(defOp);
   }

   auto blockArg = dyn_cast<BlockArgument>(token);

@@ -50,6 +51,22 @@ bool comesFromAsyncWait(Value token) {
 }
 } // namespace

+void addLocalBarrierAfterAmdGpuAsyncWait(ModuleOp mod) {
+  auto *ctx = mod->getContext();
+
+  SmallVector<amdgpu::AsyncWaitOp> waits;
+  mod->walk([&waits](amdgpu::AsyncWaitOp waitOp) { waits.push_back(waitOp); });
+
+  IRRewriter builder(mod.getContext());
+  for (auto waitOp : waits) {
+    if (isa<mlir::gpu::BarrierOp, gpu::LocalBarrierOp>(waitOp->getNextNode()))
+      continue;
+
+    builder.setInsertionPointAfter(waitOp);
+    builder.create<triton::gpu::LocalBarrierOp>(waitOp->getLoc());
+  }
+}
+
 void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod) {
   auto *ctx = mod->getContext();
third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h

Lines changed: 6 additions & 0 deletions

@@ -9,6 +9,12 @@
 namespace mlir::triton::AMD {
 class TargetInfo;

+// Walks the module and adds a LocalBarrier after any amdgpu.async_wait if there
+// is not already a barrier following it. This mimics what Membar does for
+// common async wait operations and avoids AMD-specific modifications to Membar.
+// This yields the same behaviour as when Membar adds the barrier.
+void addLocalBarrierAfterAmdGpuAsyncWait(ModuleOp mod);
+
 // Annotates LocalLoadOps with ttg.amdgpu.syncedByAsyncWait=true if they are
 // synced by an AsyncWait.
 void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod);
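In IR terms, the helper performs the following rewrite (a sketch; %tok and the memdesc/tensor types are placeholders):

%w = amdgpu.async_wait %tok {num_inst = 0 : i32}
%v = ttg.local_load %buf token %w : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>

becomes

%w = amdgpu.async_wait %tok {num_inst = 0 : i32}
ttg.local_barrier
%v = ttg.local_load %buf token %w : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>

unless a gpu.barrier or ttg.local_barrier already follows the wait, so Membar then sees the explicit barrier and needs no AMD-specific handling of the new op.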

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 5 additions & 4 deletions

@@ -1887,14 +1887,15 @@ struct AtomicRMWOpConversion
 }
 };

-struct AsyncWaitOpConversion : public ConvertOpToLLVMPattern<AsyncWaitOp> {
+struct AsyncWaitOpConversion
+    : public ConvertOpToLLVMPattern<amdgpu::AsyncWaitOp> {
   AsyncWaitOpConversion(LLVMTypeConverter &converter,
                         const AMD::TargetInfo &targetInfo,
                         PatternBenefit benefit)
       : ConvertOpToLLVMPattern(converter, benefit), targetInfo(targetInfo) {}

   LogicalResult
-  matchAndRewrite(AsyncWaitOp op, OpAdaptor adaptor,
+  matchAndRewrite(amdgpu::AsyncWaitOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = op->getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);

@@ -1912,7 +1913,7 @@ struct AsyncWaitOpConversion : public ConvertOpToLLVMPattern<AsyncWaitOp> {
     // interested in those.

     // Clamp vmcnt to 6bits; a lower vmcnt will produce a conservative wait
-    unsigned vmCnt = std::min(63u, op.getNum());
+    unsigned vmCnt = std::min(63u, op.getNumInst());

     // Extract low and high bits and combine while setting all other bits to 1
     unsigned lowBits = vmCnt & 0xF;

@@ -1925,7 +1926,7 @@ struct AsyncWaitOpConversion : public ConvertOpToLLVMPattern<AsyncWaitOp> {
   }
   case ISAFamily::GFX1250: {
     // Clamp asyncCnt to 6bits(hw imit); lower means conservative
-    unsigned asyncCnt = std::min(63u, op.getNum());
+    unsigned asyncCnt = std::min(63u, op.getNumInst());
     LLVM::createLLVMIntrinsicCallOp(rewriter, loc,
                                     "llvm.amdgcn.s.wait.asynccnt", {},
                                     {b.i16_val(asyncCnt)});
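The split fields explain the waitcnt constants checked in async_ops_to_llvm.mlir above: on gfx9-class targets the 6-bit vmcnt is encoded in bits [3:0] (low nibble) and [15:14] (high two bits) of the s_waitcnt immediate, with every other bit set to 1 so no other counter is waited on. For num_inst = 1 this gives lowBits = 1 and highBits = 0, i.e. 0xFFFF3FF1 = -49167; num_inst >= 63 saturates everything to 0xFFFFFFFF = -1. A sketch of the emitted op for num_inst = 1:

// vmcnt = 1: all bits 1 except the vmcnt fields -> 0xFFFF3FF1 (-49167)
rocdl.s.waitcnt -49167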

third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp

Lines changed: 2 additions & 0 deletions

@@ -121,6 +121,8 @@ struct ConvertTritonAMDGPUToLLVM

   if (targetInfo.requiresAliasInfoForAsyncOps())
     AMD::annotateLocalLoadsSyncedViaAsyncWait(mod);
+
+  AMD::addLocalBarrierAfterAmdGpuAsyncWait(mod);
   ModuleMembarAnalysis membarPass(&allocation,
                                   mlir::triton::AMD::membarFilter);
   membarPass.run();
