133 changes: 0 additions & 133 deletions llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -511,19 +511,6 @@ class AMDGPULowerModuleLDS {
return MostUsed.GV;
}

static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
uint32_t Address) {
// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half-open range, [Address, Address+1)
LLVMContext &Ctx = M->getContext();
auto *IntTy =
M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
GV->setMetadata(LLVMContext::MD_absolute_symbol,
MDNode::get(Ctx, {MinC, MaxC}));
}

DenseMap<Function *, Value *> tableKernelIndexCache;
Value *getTableLookupKernelIndex(Module &M, Function *F) {
// Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
@@ -922,126 +909,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}

static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
Function *KF) {
bool NeedsReplacement = false;
for (Use &U : GV->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
Function *F = I->getFunction();
if (isKernelLDS(F) && F != KF) {
NeedsReplacement = true;
break;
}
}
}
if (!NeedsReplacement)
return GV;
// Create a new GV used only by this kernel and its function
GlobalVariable *NewGV = new GlobalVariable(
M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
NewGV->copyAttributesFrom(GV);
for (Use &U : make_early_inc_range(GV->uses())) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
Function *F = I->getFunction();
if (!isKernelLDS(F) || F == KF) {
U.getUser()->replaceUsesOfWith(GV, NewGV);
}
}
}
return NewGV;
}

bool lowerSpecialLDSVariables(
Module &M, LDSUsesInfoTy &LDSUsesInfo,
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
bool Changed = false;
const DataLayout &DL = M.getDataLayout();
// The 1st round: give module-absolute assignments
int NumAbsolutes = 0;
std::vector<GlobalVariable *> OrderedGVs;
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
GlobalVariable *GV = K.first;
if (!isNamedBarrier(*GV))
continue;
// give a module-absolute assignment if it is indirectly accessed by
// multiple kernels. This is not precise, but we don't want to duplicate
// a function when it is called by multiple kernels.
if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
OrderedGVs.push_back(GV);
} else {
// leave it to the 2nd round, which will give a kernel-relative
// assignment if it is only indirectly accessed by one kernel
LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
}
LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = NumAbsolutes + 1;
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
NumAbsolutes += BarCnt;

// 4 bits for alignment, 5 bits for the barrier num,
// 3 bits for the barrier scope
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
recordLDSAbsoluteAddress(&M, GV, Offset);
}
OrderedGVs.clear();

// The 2nd round: give a kernel-relative assignment to each GV that is
// either only indirectly accessed by a single kernel or only directly
// accessed by multiple kernels.
std::vector<Function *> OrderedKernels;
for (auto &K : LDSUsesInfo.direct_access) {
Function *F = K.first;
assert(isKernelLDS(F));
OrderedKernels.push_back(F);
}
OrderedKernels = sortByName(std::move(OrderedKernels));

llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
for (Function *F : OrderedKernels) {
for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
if (!isNamedBarrier(*GV))
continue;

LDSUsesInfo.direct_access[F].erase(GV);
if (GV->isAbsoluteSymbolRef()) {
// already assigned
continue;
}
OrderedGVs.push_back(GV);
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
// GV could also be used directly by other kernels. If so, we need to
// create a new GV used only by this kernel and its function.
auto NewGV = uniquifyGVPerKernel(M, GV, F);
Changed |= (NewGV != GV);
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = Kernel2BarId[F];
BarId += NumAbsolutes + 1;
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
Kernel2BarId[F] += BarCnt;
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
recordLDSAbsoluteAddress(&M, NewGV, Offset);
}
OrderedGVs.clear();
}
// Also erase those special LDS variables from indirect_access.
for (auto &K : LDSUsesInfo.indirect_access) {
assert(isKernelLDS(K.first));
for (GlobalVariable *GV : K.second) {
if (isNamedBarrier(*GV))
K.second.erase(GV);
}
}
return Changed;
}

bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
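Note that the three blocks deleted above (recordLDSAbsoluteAddress, uniquifyGVPerKernel, lowerSpecialLDSVariables) are moved, not dropped: they reappear as shared utilities in AMDGPUMemoryUtils.cpp below so that AMDGPUSwLowerLDS can reuse them. As a reminder of the convention they rely on, the !absolute_symbol metadata encodes a half-open range [Min, Max), and the helper always writes {Address, Address + 1}, pinning the symbol to exactly one address. A minimal standalone sketch (the address value is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Count the addresses A with Min <= A < Max, i.e. those that fall in the
// half-open range [Min, Max) that !absolute_symbol metadata encodes.
uint32_t addressesInRange(uint32_t Min, uint32_t Max) { return Max - Min; }

int main() {
  uint32_t Address = 0x802010; // illustrative barrier offset
  // recordLDSAbsoluteAddress writes {Address, Address + 1} ...
  assert(addressesInRange(Address, Address + 1) == 1);
  // ... so the assembler recovers a unique absolute address for the symbol.
  return 0;
}
```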
138 changes: 138 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -439,4 +439,142 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
return false;
}

GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
Function *KF) {
bool NeedsReplacement = false;
for (Use &U : GV->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
Function *F = I->getFunction();
if (isKernelLDS(F) && F != KF) {
NeedsReplacement = true;
break;
}
}
}
if (!NeedsReplacement)
return GV;
// Create a new GV used only by this kernel and its function
GlobalVariable *NewGV = new GlobalVariable(
M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
NewGV->copyAttributesFrom(GV);
for (Use &U : make_early_inc_range(GV->uses())) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
Function *F = I->getFunction();
if (!isKernelLDS(F) || F == KF) {
U.getUser()->replaceUsesOfWith(GV, NewGV);
}
}
}
return NewGV;
}

template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
llvm::sort(V, [](const auto *L, const auto *R) {
return L->getName() < R->getName();
});
return {std::move(V)};
}

void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) {
// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half-open range, [Address, Address+1)
LLVMContext &Ctx = M->getContext();
auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
GV->setMetadata(LLVMContext::MD_absolute_symbol,
MDNode::get(Ctx, {MinC, MaxC}));
}

bool lowerSpecialLDSVariables(
Module &M, LDSUsesInfoTy &LDSUsesInfo,
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
bool Changed = false;
const DataLayout &DL = M.getDataLayout();
// The 1st round: give module-absolute assignments
int NumAbsolutes = 0;
std::vector<GlobalVariable *> OrderedGVs;
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
GlobalVariable *GV = K.first;
if (!isNamedBarrier(*GV))
continue;
// give a module-absolute assignment if it is indirectly accessed by
// multiple kernels. This is not precise, but we don't want to duplicate
// a function when it is called by multiple kernels.
if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
OrderedGVs.push_back(GV);
} else {
// leave it to the 2nd round, which will give a kernel-relative
// assignment if it is only indirectly accessed by one kernel
LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
}
LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = NumAbsolutes + 1;
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
NumAbsolutes += BarCnt;

// 4 bits for alignment, 5 bits for the barrier num,
// 3 bits for the barrier scope
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
recordLDSAbsoluteAddress(&M, GV, Offset);
}
OrderedGVs.clear();

// The 2nd round: give a kernel-relative assignment to each GV that is
// either only indirectly accessed by a single kernel or only directly
// accessed by multiple kernels.
std::vector<Function *> OrderedKernels;
for (auto &K : LDSUsesInfo.direct_access) {
Function *F = K.first;
assert(isKernelLDS(F));
OrderedKernels.push_back(F);
}
OrderedKernels = sortByName(std::move(OrderedKernels));

llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
for (Function *F : OrderedKernels) {
for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
if (!isNamedBarrier(*GV))
continue;

LDSUsesInfo.direct_access[F].erase(GV);
if (GV->isAbsoluteSymbolRef()) {
// already assigned
continue;
}
OrderedGVs.push_back(GV);
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
// GV could also be used directly by other kernels. If so, we need to
// create a new GV used only by this kernel and its function.
auto NewGV = uniquifyGVPerKernel(M, GV, F);
Changed |= (NewGV != GV);
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = Kernel2BarId[F];
BarId += NumAbsolutes + 1;
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
Kernel2BarId[F] += BarCnt;
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
recordLDSAbsoluteAddress(&M, NewGV, Offset);
}
OrderedGVs.clear();
}
// Also erase those special LDS variables from indirect_access.
for (auto &K : LDSUsesInfo.indirect_access) {
assert(isKernelLDS(K.first));
for (GlobalVariable *GV : K.second) {
if (isNamedBarrier(*GV))
K.second.erase(GV);
}
}
return Changed;
}

} // end namespace llvm::AMDGPU
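The Offset computation above packs three fields into one word: bits [3:0] are the alignment, bits [8:4] the barrier id, and bits [11:9] the barrier scope, OR'd with the fixed tag 0x802000. A standalone round-trip sketch of the encoding (the scope value 0 is an illustrative stand-in for BARRIER_SCOPE_WORKGROUP, whose numeric value is not shown in this patch):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Scope = 0; // illustrative stand-in for BARRIER_SCOPE_WORKGROUP
  const uint32_t BarId = 1; // first id handed out (NumAbsolutes + 1, none assigned yet)
  const uint32_t Offset = 0x802000u | Scope << 9 | BarId << 4;

  // The fixed tag does not overlap the three fields, so each round-trips:
  assert((Offset & 0xFu) == 0);             // bits [3:0], alignment, left zero here
  assert(((Offset >> 4) & 0x1Fu) == BarId); // bits [8:4], barrier id
  assert(((Offset >> 9) & 0x7u) == Scope);  // bits [11:9], barrier scope

  // BarCnt = allocation size / 16: each named barrier occupies 16 bytes, so
  // a 32-byte variable consumes two consecutive barrier ids.
  assert(32u / 16u == 2u);
  return 0;
}
```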
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -71,6 +71,22 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
AAResults *AA);

/// Create a copy of \p GV in \p M uniquified for kernel \p KF. The copy has
/// the same properties as \p GV but a name derived from \p GV and \p KF to
/// ensure uniqueness. Returns \p GV itself if no other kernel uses it.
GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
Function *KF);

/// Record the absolute address of \p GV as !absolute_symbol metadata on the
/// variable, where the assembler can retrieve it.
void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address);

/// Lower special LDS variables, i.e. named barriers, in \p M.
/// Updates \p LDSUsesInfo and \p LDSToKernelsThatNeedToAccessItIndirectly
/// to reflect any changes made.
bool lowerSpecialLDSVariables(
Module &M, LDSUsesInfoTy &LDSUsesInfo,
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly);

} // end namespace AMDGPU

} // end namespace llvm
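These declarations are consumed together. A condensed sketch of the call-site pattern, mirroring the run() change in AMDGPUSwLowerLDS.cpp below (the wrapper function name here is hypothetical):

```cpp
#include "AMDGPUMemoryUtils.h"
#include "llvm/IR/Module.h"
#include <cassert>

using namespace llvm;
using namespace llvm::AMDGPU;

// Hypothetical wrapper condensing the pattern used by AMDGPUSwLowerLDS::run().
static bool lowerSpecialGVs(Module &M, LDSUsesInfoTy &LDSUsesInfo) {
  if (!LDSUsesInfo.HasSpecialGVs)
    return false;
  // Invert kernel -> {GV} into GV -> {kernels that reach it indirectly}.
  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
  for (auto &K : LDSUsesInfo.indirect_access) {
    Function *F = K.first;
    assert(isKernelLDS(F));
    for (GlobalVariable *GV : K.second)
      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
  }
  return lowerSpecialLDSVariables(M, LDSUsesInfo,
                                  LDSToKernelsThatNeedToAccessItIndirectly);
}
```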
31 changes: 17 additions & 14 deletions llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -304,18 +304,6 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
}
}

static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
uint32_t Address) {
// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half-open range, [Address, Address+1)
LLVMContext &Ctx = M.getContext();
auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
MDBuilder MDB(Ctx);
MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
ConstantInt::get(IntTy, Address + 1));
GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
}

static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
bool IsDynLDS) {
if (Offset != 0) {
@@ -378,10 +366,10 @@ void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
bool IsDynLDSUsed = LDSParams.SwDynLDS;
uint32_t Offset = LDSParams.LDSSize;
recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
recordLDSAbsoluteAddress(&M, LDSParams.SwLDS, 0);
addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
if (LDSParams.SwDynLDS)
recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
recordLDSAbsoluteAddress(&M, LDSParams.SwDynLDS, Offset);
}

void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
@@ -1161,6 +1149,21 @@ bool AMDGPUSwLowerLDS::run() {
if (!LowerAllLDS)
return Changed;

// Lower special LDS variables like named barriers.
if (LDSUsesInfo.HasSpecialGVs) {
// For each variable accessed through callees, record which kernels access it
VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
for (auto &K : LDSUsesInfo.indirect_access) {
Function *F = K.first;
assert(isKernelLDS(F));
for (GlobalVariable *GV : K.second) {
LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
}
}
Changed |= lowerSpecialLDSVariables(
M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
}

// Utility to group LDS access into direct, indirect, static and dynamic.
auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
bool DirectAccess) {
Expand Down
Loading