[AMDGPU][ASAN] Handle special GVs lowering in amdgpu-sw-lower-lds #161827
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)

Changes

This PR updates the `amdgpu-sw-lower-lds` pass to use the logic for handling special GVs, i.e. named barriers, from `amdgpu-lower-module-lds`. It also moves the utility methods `recordLDSAbsoluteAddress`, `uniquifyGVPerKernel`, and `lowerSpecialLDSVariables` from `amdgpu-lower-module-lds` to the `AMDGPUMemoryUtils` module.

Patch is 37.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161827.diff

6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6726822..3036b992bdcc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -511,19 +511,6 @@ class AMDGPULowerModuleLDS {
return MostUsed.GV;
}
- static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
- uint32_t Address) {
- // Write the specified address into metadata where it can be retrieved by
- // the assembler. Format is a half open range, [Address Address+1)
- LLVMContext &Ctx = M->getContext();
- auto *IntTy =
- M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
- auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
- auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
- GV->setMetadata(LLVMContext::MD_absolute_symbol,
- MDNode::get(Ctx, {MinC, MaxC}));
- }
-
DenseMap<Function *, Value *> tableKernelIndexCache;
Value *getTableLookupKernelIndex(Module &M, Function *F) {
// Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
@@ -922,126 +909,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
- bool NeedsReplacement = false;
- for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
- }
- }
- }
- if (!NeedsReplacement)
- return GV;
- // Create a new GV used only by this kernel and its function
- GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
- GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
- GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
- NewGV->copyAttributesFrom(GV);
- for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
- }
- }
- }
- return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
- bool Changed = false;
- const DataLayout &DL = M.getDataLayout();
- // The 1st round: give module-absolute assignments
- int NumAbsolutes = 0;
- std::vector<GlobalVariable *> OrderedGVs;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
- continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
- OrderedGVs.push_back(GV);
- } else {
- // leave it to the 2nd round, which will give a kernel-relative
- // assignment if it is only indirectly accessed by one kernel
- LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
- }
- OrderedGVs.clear();
-
- // The 2nd round: give a kernel-relative assignment for GV that
- // either only indirectly accessed by single kernel or only directly
- // accessed by multiple kernels.
- std::vector<Function *> OrderedKernels;
- for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
- }
- OrderedKernels = sortByName(std::move(OrderedKernels));
-
- llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
- for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
- if (!isNamedBarrier(*GV))
- continue;
-
- LDSUsesInfo.direct_access[F].erase(GV);
- if (GV->isAbsoluteSymbolRef()) {
- // already assigned
- continue;
- }
- OrderedGVs.push_back(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- // GV could also be used directly by other kernels. If so, we need to
- // create a new GV used only by this kernel and its function.
- auto NewGV = uniquifyGVPerKernel(M, GV, F);
- Changed |= (NewGV != GV);
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = Kernel2BarId[F];
- BarId += NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- Kernel2BarId[F] += BarCnt;
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, NewGV, Offset);
- }
- OrderedGVs.clear();
- }
- // Also erase those special LDS variables from indirect_access.
- for (auto &K : LDSUsesInfo.indirect_access) {
- assert(isKernelLDS(K.first));
- for (GlobalVariable *GV : K.second) {
- if (isNamedBarrier(*GV))
- K.second.erase(GV);
- }
- }
- return Changed;
- }
-
bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index e17c2113ca398..0a204d2bd0021 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -439,4 +439,142 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
return false;
}
+GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernelLDS(F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernelLDS(F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+ llvm::sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) {
+ // Write the specified address into metadata where it can be retrieved by
+ // the assembler. Format is a half open range, [Address Address+1)
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ std::vector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ NumAbsolutes += BarCnt;
+
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment for GV that
+ // either only indirectly accessed by single kernel or only directly
+ // accessed by multiple kernels.
+ std::vector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernelLDS(K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
} // end namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index 058e74452573c..f867c695fa77f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -71,6 +71,22 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
AAResults *AA);
+/// Create a new global variable in \p M based on \p GV, but uniquified for
+/// \p KF. The new global variable will have the same properties as \p GV, but
+/// will have a name based on \p GV and \p KF to ensure uniqueness.
+GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF);
+
+/// Record the absolute address of \p GV in the module flag metadata of \p M.
+void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address);
+
+/// Lower special LDS variables i.e named barriers in \p M.
+/// Update \p LDSUsesInfo and \p LDSToKernelsThatNeedToAccessItIndirectly
+/// to reflect any changes made.
+bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 4a9437b37aa39..c4f37992b6cee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -304,18 +304,6 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
}
}
-static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
- uint32_t Address) {
- // Write the specified address into metadata where it can be retrieved by
- // the assembler. Format is a half open range, [Address Address+1)
- LLVMContext &Ctx = M.getContext();
- auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
- MDBuilder MDB(Ctx);
- MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
- ConstantInt::get(IntTy, Address + 1));
- GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
-}
-
static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
bool IsDynLDS) {
if (Offset != 0) {
@@ -378,10 +366,10 @@ void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
bool IsDynLDSUsed = LDSParams.SwDynLDS;
uint32_t Offset = LDSParams.LDSSize;
- recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
+ recordLDSAbsoluteAddress(&M, LDSParams.SwLDS, 0);
addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
if (LDSParams.SwDynLDS)
- recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
+ recordLDSAbsoluteAddress(&M, LDSParams.SwDynLDS, Offset);
}
void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
@@ -1161,6 +1149,21 @@ bool AMDGPUSwLowerLDS::run() {
if (!LowerAllLDS)
return Changed;
+ // Lower special LDS variables like named barriers.
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+ Changed |= lowerSpecialLDSVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+
// Utility to group LDS access into direct, indirect, static and dynamic.
auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
bool DirectAccess) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
new file mode 100644
index 0000000000000..357f034ef447c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
+; amdgpu-sw-lower-lds pass.
+@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META3:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel1.md.type { %llvm.amdgcn.sw.lds.kernel1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel1.md.item { i32 32, i32 1, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.kernel2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META3]]
+; @llvm.amdgcn.sw.lds.kernel2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel2.md.type { %llvm.amdgcn.sw.lds.kernel2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel2.md.item { i32 32, i32 1, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [2 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel2], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [2 x [1 x ptr addrspace(1)]] [[1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel1.md, i32 0, i32 1, i32 0)], [1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel2.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
+define void @func1() {
+; CHECK-LABEL: define void @func1() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x [1 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]]
+; CHECK-NEXT: store i8 1, ptr addrspace...
[truncated]
; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
; amdgpu-sw-lower-lds pass.
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
I don't understand why the diff is so big. These aren't special named globals, they are special target types.
AFAIU, LDS globals with TargetExtType and the name "amdgcn.named.barrier" are called named barriers in the backend.
This test in particular has a static LDS global that a kernel accesses both directly and indirectly through device functions, so the generated IR carries the ASan-specific lowering for all of those scenarios. The named-barrier LDS variables are lowered following the lower-module-lds pass.
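For illustration, a minimal sketch of how such a check could look, assuming the TargetExtType shape described above; this is not the verbatim upstream isNamedBarrier helper (the real one may also look through aggregate types):

#include "llvm/IR/DerivedTypes.h"   // TargetExtType
#include "llvm/IR/GlobalVariable.h"
using namespace llvm;

// Hypothetical sketch: an LDS global counts as a named barrier when its
// value type is the target extension type "amdgcn.named.barrier".
static bool isNamedBarrierSketch(const GlobalVariable &GV) {
  if (auto *TTy = dyn_cast<TargetExtType>(GV.getValueType()))
    return TTy->getName() == "amdgcn.named.barrier";
  return false;
}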
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s

; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
Suggested change:
-; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
+; This test checks that named barriers in LDS and other LDS in module are lowered correctly by the
Fixed. Thanks.
It seems to me better if we can ignore those "special" GVs in amdgpu-sw-lower-lds instead of duplicating the call; maybe amdgpu-lower-module-lds also needs some adjustment to handle this scenario we have not tested before.
As per the check in amdgpu-lower-module-lds:

// Verify that we fall into one of 2 cases:
//   - All variables are either absolute
//     or direct mapped dynamic LDS that is not lowered.
//     this is a re-run of the pass
//     so we don't have anything to do.
//   - No variables are absolute.

AFAIU, this check is essential to make the pass idempotent, so that no lowering happens in the rerun once it has already been done, and it also fails in case of partial lowering. This is the reason why, in the ASan case, we handle all the lowering in the amdgpu-sw-lower-lds pass.
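A minimal sketch of what that invariant amounts to (an illustration of the described check, not the verbatim upstream code; Variables is a hypothetical list of the LDS globals still under consideration):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/GlobalVariable.h"
#include <cassert>
using namespace llvm;

// Hypothetical illustration: either every remaining LDS variable already
// carries an absolute_symbol assignment (a re-run; nothing left to do),
// or none of them do (a first run). A mix means partial lowering, which
// the pass rejects.
static void checkLoweringInvariant(ArrayRef<GlobalVariable *> Variables) {
  bool AllAbsolute = llvm::all_of(
      Variables, [](GlobalVariable *GV) { return GV->isAbsoluteSymbolRef(); });
  bool NoneAbsolute = llvm::none_of(
      Variables, [](GlobalVariable *GV) { return GV->isAbsoluteSymbolRef(); });
  assert((AllAbsolute || NoneAbsolute) && "partial lowering is not supported");
  (void)AllAbsolute;
  (void)NoneAbsolute;
}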
In amdgpu-lower-module-lds pass, could we modify the check to allow another situation to work:
This PR updates the `amdgpu-sw-lower-lds` pass to use the logic for handling special GVs, i.e. named barriers, from `amdgpu-lower-module-lds`.

Also moves the utility methods `recordLDSAbsoluteAddress`, `uniquifyGVPerKernel`, and `lowerSpecialLDSVariables` from `amdgpu-lower-module-lds` to the `AMDGPUMemoryUtils` module.
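For reference, a sketch of the metadata this produces. The global name and the concrete address below are illustrative only (BarId 1, with the scope bits left out for simplicity), not values taken from this patch's tests; recordLDSAbsoluteAddress encodes the half-open range [Address, Address+1):

@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
; 8396816 = 0x802010 = 0x802000 | (BarId = 1) << 4
!0 = !{i32 8396816, i32 8396817}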