Skip to content

Commit 99a4a7d

Browse files
committed
[AMDGPU][ASAN] Handle special GVs in amdgpu-sw-lower-lds
1 parent cf50bbf commit 99a4a7d

File tree

6 files changed

+525
-147
lines changed

6 files changed

+525
-147
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 0 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -511,19 +511,6 @@ class AMDGPULowerModuleLDS {
511511
return MostUsed.GV;
512512
}
513513

514-
static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
515-
uint32_t Address) {
516-
// Write the specified address into metadata where it can be retrieved by
517-
// the assembler. Format is a half open range, [Address Address+1)
518-
LLVMContext &Ctx = M->getContext();
519-
auto *IntTy =
520-
M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
521-
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
522-
auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
523-
GV->setMetadata(LLVMContext::MD_absolute_symbol,
524-
MDNode::get(Ctx, {MinC, MaxC}));
525-
}
526-
527514
DenseMap<Function *, Value *> tableKernelIndexCache;
528515
Value *getTableLookupKernelIndex(Module &M, Function *F) {
529516
// Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
@@ -922,126 +909,6 @@ class AMDGPULowerModuleLDS {
922909
return KernelToCreatedDynamicLDS;
923910
}
924911

925-
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
926-
Function *KF) {
927-
bool NeedsReplacement = false;
928-
for (Use &U : GV->uses()) {
929-
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
930-
Function *F = I->getFunction();
931-
if (isKernelLDS(F) && F != KF) {
932-
NeedsReplacement = true;
933-
break;
934-
}
935-
}
936-
}
937-
if (!NeedsReplacement)
938-
return GV;
939-
// Create a new GV used only by this kernel and its function
940-
GlobalVariable *NewGV = new GlobalVariable(
941-
M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
942-
GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
943-
GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
944-
NewGV->copyAttributesFrom(GV);
945-
for (Use &U : make_early_inc_range(GV->uses())) {
946-
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
947-
Function *F = I->getFunction();
948-
if (!isKernelLDS(F) || F == KF) {
949-
U.getUser()->replaceUsesOfWith(GV, NewGV);
950-
}
951-
}
952-
}
953-
return NewGV;
954-
}
955-
956-
bool lowerSpecialLDSVariables(
957-
Module &M, LDSUsesInfoTy &LDSUsesInfo,
958-
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
959-
bool Changed = false;
960-
const DataLayout &DL = M.getDataLayout();
961-
// The 1st round: give module-absolute assignments
962-
int NumAbsolutes = 0;
963-
std::vector<GlobalVariable *> OrderedGVs;
964-
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
965-
GlobalVariable *GV = K.first;
966-
if (!isNamedBarrier(*GV))
967-
continue;
968-
// give a module-absolute assignment if it is indirectly accessed by
969-
// multiple kernels. This is not precise, but we don't want to duplicate
970-
// a function when it is called by multiple kernels.
971-
if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
972-
OrderedGVs.push_back(GV);
973-
} else {
974-
// leave it to the 2nd round, which will give a kernel-relative
975-
// assignment if it is only indirectly accessed by one kernel
976-
LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
977-
}
978-
LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
979-
}
980-
OrderedGVs = sortByName(std::move(OrderedGVs));
981-
for (GlobalVariable *GV : OrderedGVs) {
982-
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
983-
unsigned BarId = NumAbsolutes + 1;
984-
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
985-
NumAbsolutes += BarCnt;
986-
987-
// 4 bits for alignment, 5 bits for the barrier num,
988-
// 3 bits for the barrier scope
989-
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
990-
recordLDSAbsoluteAddress(&M, GV, Offset);
991-
}
992-
OrderedGVs.clear();
993-
994-
// The 2nd round: give a kernel-relative assignment for GV that
995-
// either only indirectly accessed by single kernel or only directly
996-
// accessed by multiple kernels.
997-
std::vector<Function *> OrderedKernels;
998-
for (auto &K : LDSUsesInfo.direct_access) {
999-
Function *F = K.first;
1000-
assert(isKernelLDS(F));
1001-
OrderedKernels.push_back(F);
1002-
}
1003-
OrderedKernels = sortByName(std::move(OrderedKernels));
1004-
1005-
llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
1006-
for (Function *F : OrderedKernels) {
1007-
for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
1008-
if (!isNamedBarrier(*GV))
1009-
continue;
1010-
1011-
LDSUsesInfo.direct_access[F].erase(GV);
1012-
if (GV->isAbsoluteSymbolRef()) {
1013-
// already assigned
1014-
continue;
1015-
}
1016-
OrderedGVs.push_back(GV);
1017-
}
1018-
OrderedGVs = sortByName(std::move(OrderedGVs));
1019-
for (GlobalVariable *GV : OrderedGVs) {
1020-
// GV could also be used directly by other kernels. If so, we need to
1021-
// create a new GV used only by this kernel and its function.
1022-
auto NewGV = uniquifyGVPerKernel(M, GV, F);
1023-
Changed |= (NewGV != GV);
1024-
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
1025-
unsigned BarId = Kernel2BarId[F];
1026-
BarId += NumAbsolutes + 1;
1027-
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1028-
Kernel2BarId[F] += BarCnt;
1029-
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
1030-
recordLDSAbsoluteAddress(&M, NewGV, Offset);
1031-
}
1032-
OrderedGVs.clear();
1033-
}
1034-
// Also erase those special LDS variables from indirect_access.
1035-
for (auto &K : LDSUsesInfo.indirect_access) {
1036-
assert(isKernelLDS(K.first));
1037-
for (GlobalVariable *GV : K.second) {
1038-
if (isNamedBarrier(*GV))
1039-
K.second.erase(GV);
1040-
}
1041-
}
1042-
return Changed;
1043-
}
1044-
1045912
bool runOnModule(Module &M) {
1046913
CallGraph CG = CallGraph(M);
1047914
bool Changed = superAlignLDSGlobals(M);

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,4 +439,142 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
439439
return false;
440440
}
441441

442+
GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
                                    Function *KF) {
  // If no kernel other than KF reaches GV through an instruction, GV is
  // already effectively private to KF and can be reused as-is.
  bool UsedByOtherKernel = llvm::any_of(GV->uses(), [KF](const Use &U) {
    auto *I = dyn_cast<Instruction>(U.getUser());
    if (!I)
      return false;
    Function *F = I->getFunction();
    return isKernelLDS(F) && F != KF;
  });
  if (!UsedByOtherKernel)
    return GV;

  // Otherwise clone GV into a kernel-private copy. The clone mirrors every
  // property of the original; only the name differs (suffixed with the
  // kernel's name) to keep symbols unique.
  GlobalVariable *NewGV = new GlobalVariable(
      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
  NewGV->copyAttributesFrom(GV);

  // Redirect to the clone every instruction-use that belongs to KF or to a
  // non-kernel function; uses from other kernels keep referring to GV.
  // make_early_inc_range is required: replaceUsesOfWith mutates the use list
  // we are walking.
  for (Use &U : make_early_inc_range(GV->uses())) {
    auto *I = dyn_cast<Instruction>(U.getUser());
    if (!I)
      continue;
    Function *F = I->getFunction();
    if (!isKernelLDS(F) || F == KF)
      U.getUser()->replaceUsesOfWith(GV, NewGV);
  }
  return NewGV;
}
472+
473+
// Sort a vector of pointers by the name of the pointees, giving a
// deterministic processing order independent of pointer values.
template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
  auto NameIsLess = [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  };
  llvm::sort(V, NameIsLess);
  return std::move(V);
}
479+
480+
void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) {
  // Attach the assigned address to GV as !absolute_symbol metadata, where the
  // assembler can retrieve it. The format is the half-open range
  // [Address, Address + 1).
  LLVMContext &Ctx = M->getContext();
  auto *AddrTy =
      M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
  Metadata *RangeOps[] = {
      ConstantAsMetadata::get(ConstantInt::get(AddrTy, Address)),
      ConstantAsMetadata::get(ConstantInt::get(AddrTy, Address + 1))};
  GV->setMetadata(LLVMContext::MD_absolute_symbol, MDNode::get(Ctx, RangeOps));
}
490+
491+
// Assign hardware named-barrier IDs (encoded as absolute-symbol addresses) to
// the special LDS variables in M, in two rounds:
//   1. module-absolute IDs for barriers reached indirectly from >1 kernel;
//   2. per-kernel IDs for the rest (cloning a GV per kernel when needed).
// The handled GVs are removed from LDSUsesInfo and from
// LDSToKernelsThatNeedToAccessItIndirectly so later LDS lowering ignores them.
// Returns true if the module was changed (a GV was cloned).
//
// NOTE(review): several loops below erase from a DenseMap/DenseSet while
// range-iterating it. This relies on LLVM's erase leaving a tombstone rather
// than rehashing (so other iterators stay valid) — do not port this pattern
// to std:: containers.
bool lowerSpecialLDSVariables(
    Module &M, LDSUsesInfoTy &LDSUsesInfo,
    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // The 1st round: give module-absolute assignments
  int NumAbsolutes = 0;
  std::vector<GlobalVariable *> OrderedGVs;
  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
    GlobalVariable *GV = K.first;
    if (!isNamedBarrier(*GV))
      continue;
    // give a module-absolute assignment if it is indirectly accessed by
    // multiple kernels. This is not precise, but we don't want to duplicate
    // a function when it is called by multiple kernels.
    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
      OrderedGVs.push_back(GV);
    } else {
      // leave it to the 2nd round, which will give a kernel-relative
      // assignment if it is only indirectly accessed by one kernel
      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
    }
    // Handled either way: drop the barrier from the indirect-access map.
    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
  }
  // Sort by name so barrier IDs are assigned deterministically.
  OrderedGVs = sortByName(std::move(OrderedGVs));
  for (GlobalVariable *GV : OrderedGVs) {
    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
    // Barrier IDs are 1-based; a GV may consume several consecutive IDs
    // (one per 16 bytes of its type).
    unsigned BarId = NumAbsolutes + 1;
    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
    NumAbsolutes += BarCnt;

    // 4 bits for alignment, 5 bits for the barrier num,
    // 3 bits for the barrier scope
    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
    recordLDSAbsoluteAddress(&M, GV, Offset);
  }
  OrderedGVs.clear();

  // The 2nd round: give a kernel-relative assignment for GV that
  // either only indirectly accessed by single kernel or only directly
  // accessed by multiple kernels.
  std::vector<Function *> OrderedKernels;
  for (auto &K : LDSUsesInfo.direct_access) {
    Function *F = K.first;
    assert(isKernelLDS(F));
    OrderedKernels.push_back(F);
  }
  // Deterministic kernel order, mirroring the GV ordering above.
  OrderedKernels = sortByName(std::move(OrderedKernels));

  llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
  for (Function *F : OrderedKernels) {
    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
      if (!isNamedBarrier(*GV))
        continue;

      // Remove from direct_access even when already assigned, so the
      // remaining LDS lowering never sees named barriers.
      LDSUsesInfo.direct_access[F].erase(GV);
      if (GV->isAbsoluteSymbolRef()) {
        // already assigned
        continue;
      }
      OrderedGVs.push_back(GV);
    }
    OrderedGVs = sortByName(std::move(OrderedGVs));
    for (GlobalVariable *GV : OrderedGVs) {
      // GV could also be used directly by other kernels. If so, we need to
      // create a new GV used only by this kernel and its function.
      auto NewGV = uniquifyGVPerKernel(M, GV, F);
      Changed |= (NewGV != GV);
      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
      // Per-kernel IDs start after all module-absolute IDs so the two
      // ranges never collide.
      unsigned BarId = Kernel2BarId[F];
      BarId += NumAbsolutes + 1;
      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
      Kernel2BarId[F] += BarCnt;
      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
      recordLDSAbsoluteAddress(&M, NewGV, Offset);
    }
    OrderedGVs.clear();
  }
  // Also erase those special LDS variables from indirect_access.
  for (auto &K : LDSUsesInfo.indirect_access) {
    assert(isKernelLDS(K.first));
    for (GlobalVariable *GV : K.second) {
      if (isNamedBarrier(*GV))
        K.second.erase(GV);
    }
  }
  return Changed;
}
579+
442580
} // end namespace llvm::AMDGPU

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,22 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
7171
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
7272
AAResults *AA);
7373

74+
/// Create a new global variable in \p M based on \p GV, but uniquified for
75+
/// \p KF. The new global variable will have the same properties as \p GV, but
76+
/// will have a name based on \p GV and \p KF to ensure uniqueness.
77+
GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
78+
Function *KF);
79+
80+
/// Record the absolute address \p Address of \p GV by attaching
/// !absolute_symbol metadata to the global (using \p M for the context and
/// pointer width).
81+
void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address);
82+
83+
/// Lower special LDS variables, i.e. named barriers, in \p M.
84+
/// Update \p LDSUsesInfo and \p LDSToKernelsThatNeedToAccessItIndirectly
85+
/// to reflect any changes made.
86+
bool lowerSpecialLDSVariables(
87+
Module &M, LDSUsesInfoTy &LDSUsesInfo,
88+
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly);
89+
7490
} // end namespace AMDGPU
7591

7692
} // end namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -304,18 +304,6 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
304304
}
305305
}
306306

307-
static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
308-
uint32_t Address) {
309-
// Write the specified address into metadata where it can be retrieved by
310-
// the assembler. Format is a half open range, [Address Address+1)
311-
LLVMContext &Ctx = M.getContext();
312-
auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
313-
MDBuilder MDB(Ctx);
314-
MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
315-
ConstantInt::get(IntTy, Address + 1));
316-
GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
317-
}
318-
319307
static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320308
bool IsDynLDS) {
321309
if (Offset != 0) {
@@ -378,10 +366,10 @@ void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378366
auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379367
bool IsDynLDSUsed = LDSParams.SwDynLDS;
380368
uint32_t Offset = LDSParams.LDSSize;
381-
recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
369+
recordLDSAbsoluteAddress(&M, LDSParams.SwLDS, 0);
382370
addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
383371
if (LDSParams.SwDynLDS)
384-
recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
372+
recordLDSAbsoluteAddress(&M, LDSParams.SwDynLDS, Offset);
385373
}
386374

387375
void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
@@ -1161,6 +1149,21 @@ bool AMDGPUSwLowerLDS::run() {
11611149
if (!LowerAllLDS)
11621150
return Changed;
11631151

1152+
// Lower special LDS variables like named barriers.
1153+
if (LDSUsesInfo.HasSpecialGVs) {
1154+
// For each variable accessed through callees, which kernels access it
1155+
VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
1156+
for (auto &K : LDSUsesInfo.indirect_access) {
1157+
Function *F = K.first;
1158+
assert(isKernelLDS(F));
1159+
for (GlobalVariable *GV : K.second) {
1160+
LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
1161+
}
1162+
}
1163+
Changed |= lowerSpecialLDSVariables(
1164+
M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
1165+
}
1166+
11641167
// Utility to group LDS access into direct, indirect, static and dynamic.
11651168
auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
11661169
bool DirectAccess) {

0 commit comments

Comments
 (0)