
Commit 65a80e4

AlexAUT and antiagainst authored
[AMD] Fix vmcnt(0) for LocalLoads with loop-carried AsyncToken (#7052)
Moves all async-related LLVM workaround functions into a separate utility file. Reuses the `LocalLoad` annotations introduced by triton-lang/triton#7047 to handle loop-carried tokens in the alias computations. The only functional change is better handling of the vmcnt(0) case: before this PR we emitted a vmcnt(0) before the ds_read in such cases.

Co-authored-by: Lei Zhang <[email protected]>
1 parent 24863d6 · commit 65a80e4

File tree: 15 files changed, +247 −181 lines


test/Conversion/amd/async-ops-alias-scopes.mlir

Lines changed: 56 additions & 15 deletions
@@ -1,12 +1,13 @@
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=COMMON,GFX950
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=COMMON,GFX942

-// COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
-// COMMON: [[LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
+// COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+// COMMON: [[$LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+// COMMON-LABEL: @async_copy_alias
tt.func public @async_copy_alias(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: !ttg.memdesc<64x1xf32, #shared, #smem, mutable>,
%maskVal: i1) {
@@ -15,9 +16,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
%ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked>
%mask = tt.splat %maskVal : i1 -> tensor<64x1xi1, #blocked>

-// COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
// Check that store for 'other' has alias information set
-// COMMON: llvm.store {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%0 = ttg.async_copy_global_to_local %ptr, %arg1 mask %mask other %other : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>

// COMMON: llvm.return
@@ -27,21 +28,22 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ

// -----

-// COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+// COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+// COMMON-LABEL: @buffer_load_to_local_alias
tt.func public @buffer_load_to_local_alias(%maskVal: i1,
%arg1: !tt.ptr<f32>,
%arg2: tensor<8x64xi32, #blocked>,
%arg3: !ttg.memdesc<8x64xf32, #shared, #smem, mutable>) {
%mask = tt.splat %maskVal : i1 -> tensor<8x64xi1, #blocked>
%other = arith.constant dense<1.000000e+00> : tensor<8x64xf32, #blocked>

-// COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}} {alias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
// Check that store for 'other' has alias information set
-// COMMON: llvm.store {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%65 = amdgpu.buffer_load_to_local %arg1[%arg2] mask=%mask other=%other into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f32>[tensor<8x64xi32, #blocked>] tensor<8x64xf32, #blocked> -> <8x64xf32, #shared, #smem, mutable>

// COMMON: llvm.return
@@ -51,14 +53,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar

// -----

-// COMMON: [[LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
-// COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+// COMMON: [[$LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
+// COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+// COMMON-LABEL: @local_loads_with_token_from_async_wait
tt.func public @local_loads_with_token_from_async_wait(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: !ttg.memdesc<64x1xf16, #shared, #smem, mutable>,
%arg2: !ttg.memdesc<16x16xf16, #shared, #smem, mutable>) {
@@ -67,12 +70,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
// Check alias information is added for different lowering paths

// Test lowering path in common MemoryOpToLLVM pattern
-// COMMON: llvm.load {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// COMMON: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%4 = ttg.local_load %arg1 token %3 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>

// Test lowering path in AMD's MemoryOpToLLVM pattern
-// GFX942: llvm.load {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
-// GFX950: rocdl.ds.read.tr16.b64 {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// GFX942: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
+// GFX950: rocdl.ds.read.tr16.b64 {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%5 = ttg.local_load %arg2 token %3 : !ttg.memdesc<16x16xf16, #shared, #smem, mutable> -> tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>

// Stores to keep the local_loads
@@ -90,27 +93,28 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ

// Same as above but LocalLoad does not use the token from AsyncWait

-// COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+// COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+// COMMON-LABEL: @local_loads_without_token_from_async_wait
tt.func public @local_loads_without_token_from_async_wait(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: !ttg.memdesc<64x1xf32, #shared, #smem, mutable>,
%arg4: !ttg.memdesc<16x16xf32, #shared, #smem, mutable>) {
// We need the splat to allow the AxisAnalysis to work during lowering
%ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked>

-// COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[ASYNC_COPY_SCOPE]]]
+// COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%0 = ttg.async_copy_global_to_local %ptr, %arg1 : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>
%1 = ttg.async_commit_group %0

%3 = ttg.async_wait %1 {num = 1 : i32}

// Check alias information is not used at all for different lowering paths
-// COMMON-NOT: [[ASYNC_COPY_SCOPE]]
+// COMMON-NOT: [[$ASYNC_COPY_SCOPE]]

// Test lowering path in common MemoryOpToLLVM pattern
%4 = ttg.local_load %arg1 token %0 : !ttg.memdesc<64x1xf32, #shared, #smem, mutable> -> tensor<64x1xf32, #blocked>
@@ -124,3 +128,40 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
tt.return
}
}
+
+// -----
+
+// COMMON: [[$LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
+// COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
+#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#smem = #ttg.shared_memory
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+// COMMON-LABEL: @local_loads_with_loop_carried_token
+tt.func public @local_loads_with_loop_carried_token(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+%arg1: !ttg.memdesc<64x1xf16, #shared, #smem, mutable>,
+%loopIterCount: i32) {
+%c0_i32 = arith.constant 0 : i32
+%c1_i32 = arith.constant 1 : i32
+
+%1 = ttg.async_wait {num = 1 : i32}
+// COMMON: llvm.load
+%2 = ttg.local_load %arg1 token %1 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>
+
+%loop_result:2 = scf.for %arg14 = %c0_i32 to %loopIterCount step %c1_i32 iter_args(%arg10 = %1, %arg11 = %2) -> (!ttg.async.token, tensor<64x1xf16, #blocked>) : i32 {
+// COMMON: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
+%3 = ttg.local_load %arg1 token %arg10 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>
+%4 = ttg.async_wait {num = 1 : i32}
+scf.yield %4, %3: !ttg.async.token, tensor<64x1xf16, #blocked>
+}
+
+// Stores to keep the local_loads
+%ptr = tt.splat %arg0 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>, #blocked>
+tt.store %ptr, %loop_result#1 : tensor<64x1x!tt.ptr<f16>, #blocked>
+
+// COMMON: llvm.return
+tt.return
+}
+}

third_party/amd/include/TritonAMDGPUToLLVM/MembarUtility.h

Lines changed: 0 additions & 9 deletions
@@ -1,19 +1,10 @@
#ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_MEMBARUTILITY_H_
#define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_MEMBARUTILITY_H_

-#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Operation.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"

namespace mlir::triton::AMD {

-// Annotates LocalLoadOps with ttg.amdgpu.syncedByAsyncWait=true if they are
-// synced by an AsyncWait.
-void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod);
-
-// Getter for the annotation applied by annotateLocalLoadsSyncedViaAsyncWait
-bool isSyncedViaAsyncWait(triton::gpu::LocalLoadOp localLoadOp);
-
// Filter function used in the AMDGPU backend to filter unnecessary barriers
// during Membar Analysis. Filters applied by this function:
// 1) Do not create barriers between AsyncCopyGlobalToLocal and LocalLoad if the

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.cpp

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+#include "AsyncUtility.h"
+
+#include "Dialect/TritonAMDGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
+namespace mlir::triton::AMD {
+namespace {
+constexpr const char *syncedViaAsyncWaitAttrName =
+    "ttg.amdgpu.syncedViaAsyncWait";
+// Traverses the def-chain of the token, including control flow, and returns
+// true if all defining operations are AsyncWaits
+bool comesFromAsyncWait(Value token) {
+  if (auto defOp = token.getDefiningOp()) {
+    return isa<triton::gpu::AsyncWaitOp>(defOp);
+  }
+
+  auto blockArg = dyn_cast<BlockArgument>(token);
+  // If the token has no defining op and is not a BlockArgument, bail out
+  if (!blockArg) {
+    return false;
+  }
+
+  auto block = blockArg.getOwner();
+  auto argId = blockArg.getArgNumber();
+
+  auto destOperandFromAsyncWait = [argId](auto &&operands) {
+    assert(argId < operands.size());
+    return comesFromAsyncWait(operands[argId]);
+  };
+
+  // Check all predecessor blocks' terminators and follow the value passed at
+  // argId to see if it immediately comes from an AsyncWait.
+  for (auto *pred : block->getPredecessors()) {
+    auto terminator = pred->getTerminator();
+    if (auto br = dyn_cast<BranchOpInterface>(terminator)) {
+      for (auto successor : llvm::enumerate(br->getSuccessors())) {
+        if (block != successor.value())
+          continue;
+        auto operands = br.getSuccessorOperands(successor.index());
+        if (!destOperandFromAsyncWait(operands))
+          return false;
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace
+
+void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod) {
+  SmallVector<triton::gpu::LocalLoadOp> localLoads;
+  mod->walk([&](triton::gpu::LocalLoadOp localLoadOp) {
+    localLoads.emplace_back(localLoadOp);
+  });
+
+  auto *ctx = mod->getContext();
+  for (auto &loadOp : localLoads) {
+    auto token = loadOp.getToken();
+    bool isSyncedViaAsyncWait = token && comesFromAsyncWait(token);
+    loadOp->setAttr(syncedViaAsyncWaitAttrName,
+                    BoolAttr::get(ctx, isSyncedViaAsyncWait));
+  }
+}
+
+bool isSyncedViaAsyncWait(triton::gpu::LocalLoadOp localLoadOp) {
+  auto attr = localLoadOp->getAttr(syncedViaAsyncWaitAttrName);
+  if (!attr) {
+    localLoadOp.emitRemark("has no async sync information attached to it which "
+                           "might negatively affect performance. Run "
+                           "annotateLocalLoadsSyncedViaAsyncWait first");
+    return false;
+  }
+  return cast<BoolAttr>(attr).getValue();
+}
+
+namespace {
+LLVM::AliasScopeDomainAttr getLoadScopeDomain(MLIRContext *ctx) {
+  Builder b(ctx);
+  return b.getAttr<LLVM::AliasScopeDomainAttr>(
+      b.getStringAttr("amdgpu.AsyncOps"),
+      b.getStringAttr(
+          "Domain to hold alias scopes to specify aliasing information between "
+          "AsyncCopyGlobalToLocal, BufferLoadToLocal and LocalLoad ops"));
+}
+
+LLVM::AliasScopeAttr getAsyncCopyScope(MLIRContext *ctx) {
+  Builder b(ctx);
+  auto name = b.getStringAttr("amdgpu.AsyncCopies");
+  auto desc = b.getStringAttr(
+      "Scope containing all AsyncCopyGlobalToLocal and BufferLoadToLocal ops");
+  return b.getAttr<LLVM::AliasScopeAttr>(name, getLoadScopeDomain(ctx), desc);
+}
+
+LLVM::AliasScopeAttr getLoadCopyScope(MLIRContext *ctx) {
+  Builder b(ctx);
+  auto name = b.getStringAttr("amdgpu.LocalLoads");
+  auto desc = b.getStringAttr("Scope containing all LocalLoad ops");
+  return b.getAttr<LLVM::AliasScopeAttr>(name, getLoadScopeDomain(ctx), desc);
+}
+} // namespace
+
+void addAsyncCopyAliasScope(LLVM::AliasAnalysisOpInterface directToLdsOp) {
+  auto ctx = directToLdsOp->getContext();
+  Builder b(ctx);
+  directToLdsOp.setAliasScopes(b.getArrayAttr(getAsyncCopyScope(ctx)));
+}
+
+void addLocalLoadNoAliasScope(triton::gpu::LocalLoadOp localLoadOp,
+                              LLVM::AliasAnalysisOpInterface llLoadOp) {
+  if (!isSyncedViaAsyncWait(localLoadOp))
+    return;
+
+  return addLocalLoadNoAliasScope(llLoadOp);
+}
+
+void addLocalLoadNoAliasScope(LLVM::AliasAnalysisOpInterface llLoadOp) {
+  auto ctx = llLoadOp->getContext();
+
+  // Do not alias with AsyncCopies
+  auto noAliasScopes = ArrayAttr::get(ctx, getAsyncCopyScope(ctx));
+  llLoadOp.setNoAliasScopes(noAliasScopes);
+
+  // Add to a different scope, as ops without any scope alias with everything
+  auto aliasScopes = ArrayAttr::get(ctx, getLoadCopyScope(ctx));
+  llLoadOp.setAliasScopes(aliasScopes);
+}
+
+} // namespace mlir::triton::AMD

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#ifndef TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_ASYNCUTILITY_H_
+#define TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_ASYNCUTILITY_H_
+
+#include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "triton/Conversion/MLIRTypes.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
+namespace mlir::triton::AMD {
+// Annotates LocalLoadOps with ttg.amdgpu.syncedByAsyncWait=true if they are
+// synced by an AsyncWait.
+void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod);
+
+// Getter for the annotation applied by annotateLocalLoadsSyncedViaAsyncWait
+bool isSyncedViaAsyncWait(triton::gpu::LocalLoadOp localLoadOp);
+
+// LLVM is unable to deduce dependencies across warps and loop iterations for
+// AsyncCopy and LocalLoad and will emit conservative wait counts. In Triton the
+// dependency is modeled via AsyncWait, e.g.
+//   %token1 = ttg.async_copy_global_to_local/amdgpu.buffer_load_to_local
+//   %token2 = ttg.async_wait %token1
+//   %1 = ttg.local_load .. token %token2
+// For such cases AsyncWait will emit the correct wait, so the conservative
+// waits are redundant and hinder performance/interleaving.
+// To disable the conservative waits two alias scopes are created:
+// 1) "amdgpu.AsyncCopies" will contain all AsyncCopy ops
+// 2) "amdgpu.LocalLoads" will contain all LocalLoads manually synchronized via
+//    AsyncWait
+// All manually synchronized LocalLoads will additionally have "AsyncCopies" as
+// a noalias scope to disable the implicit waits from the LLVM backend
+
+// If localLoadOp has a token from an AsyncWait:
+// - Attaches the "amdgpu.LocalLoads" alias scope to llLoadOp
+// - Attaches "amdgpu.AsyncCopies" as a *noalias* scope to llLoadOp
+void addLocalLoadNoAliasScope(triton::gpu::LocalLoadOp localLoadOp,
+                              LLVM::AliasAnalysisOpInterface llLoadOp);
+// Overload of the above without checking the AsyncToken
+void addLocalLoadNoAliasScope(LLVM::AliasAnalysisOpInterface llLoadOp);
+// Attaches the "AsyncCopies" alias scope to llLoadDirectToLdsOp
+void addAsyncCopyAliasScope(LLVM::AliasAnalysisOpInterface llLoadDirectToLdsOp);
+
+} // namespace mlir::triton::AMD
+
+#endif
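
The header comment above describes the intended flow end to end; as a minimal usage sketch (the wrapper function and its parameter names below are illustrative assumptions, not code from this PR), a lowering could wire the helpers together roughly like this:

// Hedged usage sketch of the AsyncUtility API declared above. The wrapper
// tagAsyncAliasScopes and the loweredLoad/loweredCopy parameters are
// hypothetical; only the triton::AMD::* calls come from this change.
#include "AsyncUtility.h"

using namespace mlir;

void tagAsyncAliasScopes(ModuleOp mod, triton::gpu::LocalLoadOp localLoad,
                         LLVM::AliasAnalysisOpInterface loweredLoad,
                         LLVM::AliasAnalysisOpInterface loweredCopy) {
  // Run once up front so every LocalLoadOp carries the
  // ttg.amdgpu.syncedViaAsyncWait annotation, including loads whose token
  // reaches them as a loop-carried block argument.
  triton::AMD::annotateLocalLoadsSyncedViaAsyncWait(mod);

  // Each direct-to-LDS copy joins the "amdgpu.AsyncCopies" alias scope.
  triton::AMD::addAsyncCopyAliasScope(loweredCopy);

  // A LocalLoad synchronized via ttg.async_wait gets the "amdgpu.LocalLoads"
  // scope plus "amdgpu.AsyncCopies" as a noalias scope, so the LLVM backend
  // does not emit a conservative vmcnt(0) before the resulting ds_read.
  triton::AMD::addLocalLoadNoAliasScope(localLoad, loweredLoad);
}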

third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,6 @@
#include "TritonAMDGPUToLLVM/Passes.h"

+#include "AsyncUtility.h"
#include "Utility.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Pass/Pass.h"
@@ -80,7 +81,7 @@ class CallOpConversion : public OpRewritePattern<LLVM::CallOp> {
bool addAsyncAliasScopes =
    callOp.getCallee().value().contains(mlir::LLVM::AMD::noAliasAsyncLoads);
if (addAsyncAliasScopes) {
-  LLVM::AMD::addLocalLoadNoAliasScope(storeOp);
+  AMD::addLocalLoadNoAliasScope(storeOp);
}
rewriter.create<LLVM::BrOp>(loc, afterStore);
rewriter.setInsertionPointToStart(afterStore);
@@ -120,7 +121,7 @@ class CallOpConversion : public OpRewritePattern<LLVM::CallOp> {
bool addAsyncNoAliasInfo =
    callOp.getCallee().value().contains(mlir::LLVM::AMD::noAliasAsyncLoads);
if (addAsyncNoAliasInfo) {
-  LLVM::AMD::addLocalLoadNoAliasScope(loadOp);
+  AMD::addLocalLoadNoAliasScope(loadOp);
}
rewriter.create<LLVM::BrOp>(loc, loadOp->getResult(0), afterLoad);
rewriter.setInsertionPointToStart(falseBlock);
