Conversation

skc7
Contributor

@skc7 skc7 commented Oct 3, 2025

This PR updates the amdgpu-sw-lower-lds pass to reuse the logic for handling special GVs, i.e. named barriers, from amdgpu-lower-module-lds.

It also moves the utility methods recordLDSAbsoluteAddress, uniquifyGVPerKernel, and lowerSpecialLDSVariables from amdgpu-lower-module-lds to the AMDGPUMemoryUtils module.
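
For orientation, the new call site in AMDGPUSwLowerLDS::run is condensed below from the patch; aside from the surrounding pass logic being omitted, this mirrors the added code:

  // When special GVs (named barriers) are present, build the map of which
  // kernels need indirect access to each variable, then reuse the shared
  // utility that now lives in AMDGPUMemoryUtils.
  if (LDSUsesInfo.HasSpecialGVs) {
    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
    for (auto &K : LDSUsesInfo.indirect_access) {
      Function *F = K.first;
      assert(isKernelLDS(F));
      for (GlobalVariable *GV : K.second)
        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
    }
    Changed |= lowerSpecialLDSVariables(
        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
  }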

@skc7 skc7 requested a review from b-sumner October 3, 2025 11:30
@skc7 skc7 marked this pull request as ready for review October 3, 2025 11:31
@skc7 skc7 requested a review from arsenm October 3, 2025 11:31
@llvmbot
Member

llvmbot commented Oct 3, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)

Changes

This PR updates the amdgpu-sw-lower-lds pass to reuse the logic for handling special GVs, i.e. named barriers, from amdgpu-lower-module-lds.

It also moves the utility methods recordLDSAbsoluteAddress, uniquifyGVPerKernel, and lowerSpecialLDSVariables from amdgpu-lower-module-lds to the AMDGPUMemoryUtils module.


Patch is 37.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161827.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (-133)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp (+138)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h (+16)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp (+17-14)
  • (added) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll (+241)
  • (added) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier.ll (+113)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6726822..3036b992bdcc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -511,19 +511,6 @@ class AMDGPULowerModuleLDS {
     return MostUsed.GV;
   }
 
-  static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
-                                       uint32_t Address) {
-    // Write the specified address into metadata where it can be retrieved by
-    // the assembler. Format is a half open range, [Address Address+1)
-    LLVMContext &Ctx = M->getContext();
-    auto *IntTy =
-        M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
-    auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
-    auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
-    GV->setMetadata(LLVMContext::MD_absolute_symbol,
-                    MDNode::get(Ctx, {MinC, MaxC}));
-  }
-
   DenseMap<Function *, Value *> tableKernelIndexCache;
   Value *getTableLookupKernelIndex(Module &M, Function *F) {
     // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
@@ -922,126 +909,6 @@ class AMDGPULowerModuleLDS {
     return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
-                                             Function *KF) {
-    bool NeedsReplacement = false;
-    for (Use &U : GV->uses()) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (isKernelLDS(F) && F != KF) {
-          NeedsReplacement = true;
-          break;
-        }
-      }
-    }
-    if (!NeedsReplacement)
-      return GV;
-    // Create a new GV used only by this kernel and its function
-    GlobalVariable *NewGV = new GlobalVariable(
-        M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-        GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-        GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-    NewGV->copyAttributesFrom(GV);
-    for (Use &U : make_early_inc_range(GV->uses())) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (!isKernelLDS(F) || F == KF) {
-          U.getUser()->replaceUsesOfWith(GV, NewGV);
-        }
-      }
-    }
-    return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-      Module &M, LDSUsesInfoTy &LDSUsesInfo,
-      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-    bool Changed = false;
-    const DataLayout &DL = M.getDataLayout();
-    // The 1st round: give module-absolute assignments
-    int NumAbsolutes = 0;
-    std::vector<GlobalVariable *> OrderedGVs;
-    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-      GlobalVariable *GV = K.first;
-      if (!isNamedBarrier(*GV))
-        continue;
-      // give a module-absolute assignment if it is indirectly accessed by
-      // multiple kernels. This is not precise, but we don't want to duplicate
-      // a function when it is called by multiple kernels.
-      if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-        OrderedGVs.push_back(GV);
-      } else {
-        // leave it to the 2nd round, which will give a kernel-relative
-        // assignment if it is only indirectly accessed by one kernel
-        LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-      }
-      LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-    }
-    OrderedGVs = sortByName(std::move(OrderedGVs));
-    for (GlobalVariable *GV : OrderedGVs) {
-      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-      unsigned BarId = NumAbsolutes + 1;
-      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-      NumAbsolutes += BarCnt;
-
-      // 4 bits for alignment, 5 bits for the barrier num,
-      // 3 bits for the barrier scope
-      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-      recordLDSAbsoluteAddress(&M, GV, Offset);
-    }
-    OrderedGVs.clear();
-
-    // The 2nd round: give a kernel-relative assignment for GV that
-    // either only indirectly accessed by single kernel or only directly
-    // accessed by multiple kernels.
-    std::vector<Function *> OrderedKernels;
-    for (auto &K : LDSUsesInfo.direct_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      OrderedKernels.push_back(F);
-    }
-    OrderedKernels = sortByName(std::move(OrderedKernels));
-
-    llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
-    for (Function *F : OrderedKernels) {
-      for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-        if (!isNamedBarrier(*GV))
-          continue;
-
-        LDSUsesInfo.direct_access[F].erase(GV);
-        if (GV->isAbsoluteSymbolRef()) {
-          // already assigned
-          continue;
-        }
-        OrderedGVs.push_back(GV);
-      }
-      OrderedGVs = sortByName(std::move(OrderedGVs));
-      for (GlobalVariable *GV : OrderedGVs) {
-        // GV could also be used directly by other kernels. If so, we need to
-        // create a new GV used only by this kernel and its function.
-        auto NewGV = uniquifyGVPerKernel(M, GV, F);
-        Changed |= (NewGV != GV);
-        unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-        unsigned BarId = Kernel2BarId[F];
-        BarId += NumAbsolutes + 1;
-        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-        Kernel2BarId[F] += BarCnt;
-        unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-        recordLDSAbsoluteAddress(&M, NewGV, Offset);
-      }
-      OrderedGVs.clear();
-    }
-    // Also erase those special LDS variables from indirect_access.
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      assert(isKernelLDS(K.first));
-      for (GlobalVariable *GV : K.second) {
-        if (isNamedBarrier(*GV))
-          K.second.erase(GV);
-      }
-    }
-    return Changed;
-  }
-
   bool runOnModule(Module &M) {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index e17c2113ca398..0a204d2bd0021 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -439,4 +439,142 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
   return false;
 }
 
+GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+                                    Function *KF) {
+  bool NeedsReplacement = false;
+  for (Use &U : GV->uses()) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (isKernelLDS(F) && F != KF) {
+        NeedsReplacement = true;
+        break;
+      }
+    }
+  }
+  if (!NeedsReplacement)
+    return GV;
+  // Create a new GV used only by this kernel and its function
+  GlobalVariable *NewGV = new GlobalVariable(
+      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+  NewGV->copyAttributesFrom(GV);
+  for (Use &U : make_early_inc_range(GV->uses())) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (!isKernelLDS(F) || F == KF) {
+        U.getUser()->replaceUsesOfWith(GV, NewGV);
+      }
+    }
+  }
+  return NewGV;
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+  llvm::sort(V, [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return {std::move(V)};
+}
+
+void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) {
+  // Write the specified address into metadata where it can be retrieved by
+  // the assembler. Format is a half open range, [Address Address+1)
+  LLVMContext &Ctx = M->getContext();
+  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+  GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                  MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+bool lowerSpecialLDSVariables(
+    Module &M, LDSUsesInfoTy &LDSUsesInfo,
+    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+  // The 1st round: give module-absolute assignments
+  int NumAbsolutes = 0;
+  std::vector<GlobalVariable *> OrderedGVs;
+  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+    GlobalVariable *GV = K.first;
+    if (!isNamedBarrier(*GV))
+      continue;
+    // give a module-absolute assignment if it is indirectly accessed by
+    // multiple kernels. This is not precise, but we don't want to duplicate
+    // a function when it is called by multiple kernels.
+    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+      OrderedGVs.push_back(GV);
+    } else {
+      // leave it to the 2nd round, which will give a kernel-relative
+      // assignment if it is only indirectly accessed by one kernel
+      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+    }
+    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+  }
+  OrderedGVs = sortByName(std::move(OrderedGVs));
+  for (GlobalVariable *GV : OrderedGVs) {
+    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+    unsigned BarId = NumAbsolutes + 1;
+    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+    NumAbsolutes += BarCnt;
+
+    // 4 bits for alignment, 5 bits for the barrier num,
+    // 3 bits for the barrier scope
+    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+    recordLDSAbsoluteAddress(&M, GV, Offset);
+  }
+  OrderedGVs.clear();
+
+  // The 2nd round: give a kernel-relative assignment for GV that
+  // either only indirectly accessed by single kernel or only directly
+  // accessed by multiple kernels.
+  std::vector<Function *> OrderedKernels;
+  for (auto &K : LDSUsesInfo.direct_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    OrderedKernels.push_back(F);
+  }
+  OrderedKernels = sortByName(std::move(OrderedKernels));
+
+  llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+  for (Function *F : OrderedKernels) {
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+      if (!isNamedBarrier(*GV))
+        continue;
+
+      LDSUsesInfo.direct_access[F].erase(GV);
+      if (GV->isAbsoluteSymbolRef()) {
+        // already assigned
+        continue;
+      }
+      OrderedGVs.push_back(GV);
+    }
+    OrderedGVs = sortByName(std::move(OrderedGVs));
+    for (GlobalVariable *GV : OrderedGVs) {
+      // GV could also be used directly by other kernels. If so, we need to
+      // create a new GV used only by this kernel and its function.
+      auto NewGV = uniquifyGVPerKernel(M, GV, F);
+      Changed |= (NewGV != GV);
+      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+      unsigned BarId = Kernel2BarId[F];
+      BarId += NumAbsolutes + 1;
+      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+      Kernel2BarId[F] += BarCnt;
+      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+      recordLDSAbsoluteAddress(&M, NewGV, Offset);
+    }
+    OrderedGVs.clear();
+  }
+  // Also erase those special LDS variables from indirect_access.
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    assert(isKernelLDS(K.first));
+    for (GlobalVariable *GV : K.second) {
+      if (isNamedBarrier(*GV))
+        K.second.erase(GV);
+    }
+  }
+  return Changed;
+}
+
 } // end namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index 058e74452573c..f867c695fa77f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -71,6 +71,22 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
 bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                            AAResults *AA);
 
+/// Create a new global variable in \p M based on \p GV, but uniquified for
+/// \p KF. The new global variable will have the same properties as \p GV, but
+/// will have a name based on \p GV and \p KF to ensure uniqueness.
+GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+                                    Function *KF);
+
+/// Record the absolute address of \p GV in the module flag metadata of \p M.
+void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address);
+
+/// Lower special LDS variables i.e named barriers in \p M.
+/// Update \p LDSUsesInfo and \p LDSToKernelsThatNeedToAccessItIndirectly
+/// to reflect any changes made.
+bool lowerSpecialLDSVariables(
+    Module &M, LDSUsesInfoTy &LDSUsesInfo,
+    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 4a9437b37aa39..c4f37992b6cee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -304,18 +304,6 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
   }
 }
 
-static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
-                                     uint32_t Address) {
-  // Write the specified address into metadata where it can be retrieved by
-  // the assembler. Format is a half open range, [Address Address+1)
-  LLVMContext &Ctx = M.getContext();
-  auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
-  MDBuilder MDB(Ctx);
-  MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
-                                         ConstantInt::get(IntTy, Address + 1));
-  GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
-}
-
 static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
                                 bool IsDynLDS) {
   if (Offset != 0) {
@@ -378,10 +366,10 @@ void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
   bool IsDynLDSUsed = LDSParams.SwDynLDS;
   uint32_t Offset = LDSParams.LDSSize;
-  recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
+  recordLDSAbsoluteAddress(&M, LDSParams.SwLDS, 0);
   addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
   if (LDSParams.SwDynLDS)
-    recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
+    recordLDSAbsoluteAddress(&M, LDSParams.SwDynLDS, Offset);
 }
 
 void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
@@ -1161,6 +1149,21 @@ bool AMDGPUSwLowerLDS::run() {
   if (!LowerAllLDS)
     return Changed;
 
+  // Lower special LDS variables like named barriers.
+  if (LDSUsesInfo.HasSpecialGVs) {
+    // For each variable accessed through callees, which kernels access it
+    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+    for (auto &K : LDSUsesInfo.indirect_access) {
+      Function *F = K.first;
+      assert(isKernelLDS(F));
+      for (GlobalVariable *GV : K.second) {
+        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+      }
+    }
+    Changed |= lowerSpecialLDSVariables(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+  }
+
   // Utility to group LDS access into direct, indirect, static and dynamic.
   auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
                                             bool DirectAccess) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
new file mode 100644
index 0000000000000..357f034ef447c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
+; amdgpu-sw-lower-lds pass.
+@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META3:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel1.md.type { %llvm.amdgcn.sw.lds.kernel1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel1.md.item { i32 32, i32 1, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.kernel2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META3]]
+; @llvm.amdgcn.sw.lds.kernel2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel2.md.type { %llvm.amdgcn.sw.lds.kernel2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel2.md.item { i32 32, i32 1, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [2 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel2], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [2 x [1 x ptr addrspace(1)]] [[1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel1.md, i32 0, i32 1, i32 0)], [1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel2.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
+define void @func1() {
+; CHECK-LABEL: define void @func1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x [1 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]]
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]]
+; CHECK-NEXT:    store i8 1, ptr addrspace...
[truncated]
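
As a reading aid for the patch above: the absolute address that lowerSpecialLDSVariables assigns to a named barrier is built by the expression 0x802000u | BarrierScope << 9 | BarId << 4. Restated as a standalone helper (the function name here is invented for illustration; the scope constant's numeric value is target-defined and left symbolic):

  // Bits [0,3] stay zero (alignment), bits [4,8] hold the 5-bit barrier id,
  // bits [9,11] hold the 3-bit barrier scope, and 0x802000 supplies the
  // fixed marker bits. Hypothetical helper, not a function in the PR.
  constexpr unsigned encodeNamedBarrierAddress(unsigned BarId,
                                               unsigned BarrierScope) {
    return 0x802000u | (BarrierScope << 9) | (BarId << 4);
  }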

@skc7 skc7 changed the title from "[AMDGPU][ASAN] Handle special GVs in amdgpu-sw-lower-lds" to "[AMDGPU][ASAN] Handle special GVs lowering in amdgpu-sw-lower-lds" Oct 3, 2025

; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
; amdgpu-sw-lower-lds pass.
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
Contributor

I don't understand why the diff is so big. These aren't special named globals; they are special target types.

Contributor Author

AFAIU, LDS globals with a TargetExtType named "amdgcn.named.barrier" are what the backend calls named barriers.
This test in particular has a static LDS global that a kernel accesses both directly and indirectly through device functions, so the generated IR carries the ASan-specific lowering for all of those scenarios. The named-barrier LDS globals themselves are lowered following the lower-module-lds logic.
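
In code terms, that classification comes down to inspecting the global's value type. A minimal sketch of such a check, assuming the behavior of isNamedBarrier as described above (its actual implementation is not part of this diff):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/GlobalVariable.h"
  using namespace llvm;

  // A named barrier is an LDS global whose value type is the
  // "amdgcn.named.barrier" target extension type.
  static bool isNamedBarrierLike(const GlobalVariable &GV) {
    if (auto *TTy = dyn_cast<TargetExtType>(GV.getValueType()))
      return TTy->getName() == "amdgcn.named.barrier";
    return false;
  }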

; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s

; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
Contributor

Suggested change
; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
; This test checks that named barriers in LDS and other LDS in module are lowered correctly by the

Contributor Author

Fixed. Thanks.

@cmc-rep
Copy link
Contributor

cmc-rep commented Oct 6, 2025

It seems better to me if we can ignore those "special" GVs in amdgpu-sw-lower-lds instead of duplicating the call.

Maybe amdgpu-lower-module-lds also needs some adjustment to handle this scenario, which we have not tested before.

@skc7 skc7 requested a review from Pierre-vh October 7, 2025 04:05
@skc7
Copy link
Contributor Author

skc7 commented Oct 8, 2025

It seems better to me if we can ignore those "special" GVs in amdgpu-sw-lower-lds instead of duplicating the call.

Maybe amdgpu-lower-module-lds also needs some adjustment to handle this scenario, which we have not tested before.

As per the amdgpu-lower-module-lds pass, the lowering of LDS globals falls into one of two cases, as below:
LINK:

// Verify that we fall into one of 2 cases:
// - All variables are either absolute
// or direct mapped dynamic LDS that is not lowered.
// this is a re-run of the pass
// so we don't have anything to do.
// - No variables are absolute.

AFAIU, this check is essential to make the pass idempotent, so that no lowering happens on a rerun once it has already been done. It also fails in the case of partial lowering. This is why, for ASan, we handle all of the lowering in the amdgpu-sw-lower-lds pass itself.
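
A condensed sketch of that invariant, paraphrased from the quoted comment (the collection name is hypothetical, and the real pass also exempts direct-mapped dynamic LDS that is not lowered):

  // Every LDS GV must agree on whether it already has an absolute address.
  // All-absolute means this is a re-run with nothing left to do; a mix means
  // a previous run lowered only part of the module, which is a fatal error.
  std::optional<bool> HasAbsoluteGVs;
  for (GlobalVariable *GV : AllLDSVariables) { // hypothetical collection
    bool IsAbsolute = GV->isAbsoluteSymbolRef();
    if (HasAbsoluteGVs && *HasAbsoluteGVs != IsAbsolute)
      report_fatal_error("Module cannot mix absolute and non-absolute LDS GVs");
    HasAbsoluteGVs = IsAbsolute;
  }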

@cmc-rep
Copy link
Contributor

cmc-rep commented Oct 8, 2025

It seems better to me if we can ignore those "special" GVs in amdgpu-sw-lower-lds instead of duplicating the call.
Maybe amdgpu-lower-module-lds also needs some adjustment to handle this scenario, which we have not tested before.

As per the amdgpu-lower-module-lds pass, the lowering of LDS globals falls into one of two cases, as below: LINK:

// Verify that we fall into one of 2 cases:
// - All variables are either absolute
// or direct mapped dynamic LDS that is not lowered.
// this is a re-run of the pass
// so we don't have anything to do.
// - No variables are absolute.
AFAIU, this check is essential to make the pass idempotent, so that no lowering happens on a rerun once it has already been done. It also fails in the case of partial lowering. This is why, for ASan, we handle all of the lowering in the amdgpu-sw-lower-lds pass itself.

In the amdgpu-lower-module-lds pass, could we modify the check to allow another situation to work (see the sketch below):

  • All "special" GVs are unassigned, but all other normal LDS GVs are assigned.
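
One way that relaxed check might look, building on the sketch above (illustrative only, not code from the PR):

  // Relax the invariant by exempting named barriers, so a module where
  // sw-lower-lds assigned all normal LDS GVs but left the special GVs
  // unassigned is still accepted.
  std::optional<bool> HasAbsoluteGVs;
  for (GlobalVariable *GV : AllLDSVariables) { // hypothetical collection
    if (isNamedBarrier(*GV))
      continue; // special GVs may legitimately remain unassigned
    bool IsAbsolute = GV->isAbsoluteSymbolRef();
    if (HasAbsoluteGVs && *HasAbsoluteGVs != IsAbsolute)
      report_fatal_error("Module cannot mix absolute and non-absolute LDS GVs");
    HasAbsoluteGVs = IsAbsolute;
  }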
