
Commit 7047b52

kerbowa authored and bcahoon committed
[AMDGPU] Move kernarg preload logic to separate pass (llvm#130434)
Moves kernarg preload logic to its own module pass. Cloned function declarations are removed when preloading hidden arguments. The inreg attribute is now added in this pass instead of AMDGPUAttributor. The rest of the logic is copied from AMDGPULowerKernelArguments, which now only checks whether an argument is marked inreg, to avoid replacing direct uses of preloaded arguments. This change requires test updates to remove inreg from lit tests with kernels that don't actually want preloading. (cherry picked from commit 2c9a46c)
1 parent a74ba40 · commit 7047b52

17 files changed: +443 -664 lines
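
In outline, the change turns preloading into a standalone module pass that runs before argument lowering and communicates with it only through the inreg parameter attribute. A minimal sketch of that shape, assuming a hypothetical helper markPreloadCandidates that applies the SGPR-budget checks and adds inreg (the real body lives in the new AMDGPUPreloadKernelArguments.cpp, which is not among the excerpted diffs):

// Sketch only: the module pass marks preload candidates; the later
// function pass trusts the attribute instead of re-deriving budgets.
PreservedAnalyses
AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
  bool Changed = false;
  for (Function &F : M) {
    if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;
    Changed |= markPreloadCandidates(F, TM); // hypothetical helper: adds inreg
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}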

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 14 additions & 0 deletions
@@ -65,6 +65,7 @@ ModulePass *createAMDGPULowerBufferFatPointersPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsPass();
 FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
+ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);

 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
   AMDGPUSimplifyLibCallsPass() {}
@@ -234,6 +235,9 @@ extern char &GCNRegPressurePrinterID;
 void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernArgPrologLegacyID;

+void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
+extern char &AMDGPUPreloadKernelArgumentsLegacyID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -339,6 +343,16 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };

+class AMDGPUPreloadKernelArgumentsPass
+    : public PassInfoMixin<AMDGPUPreloadKernelArgumentsPass> {
+  const TargetMachine &TM;
+
+public:
+  explicit AMDGPUPreloadKernelArgumentsPass(const TargetMachine &TM) : TM(TM) {}
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 class AMDGPUAnnotateUniformValuesPass
     : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
 public:
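
Given this declaration, the pass slots into a new-pass-manager pipeline in the usual PassInfoMixin way; a small usage sketch (assumes a target machine TM, a Module M, and an initialized ModuleAnalysisManager MAM are already in scope):

ModulePassManager MPM;
MPM.addPass(AMDGPUPreloadKernelArgumentsPass(TM)); // marks preloadable args inreg
MPM.run(M, MAM);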

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 0 additions & 21 deletions
@@ -29,10 +29,6 @@ void initializeCycleInfoWrapperPassPass(PassRegistry &);

 using namespace llvm;

-static cl::opt<unsigned> KernargPreloadCount(
-    "amdgpu-kernarg-preload-count",
-    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
-
 static cl::opt<unsigned> IndirectCallSpecializationThreshold(
     "amdgpu-indirect-call-specialization-threshold",
     cl::desc(
@@ -1324,21 +1320,6 @@ struct AAAMDGPUNoAGPR

 const char AAAMDGPUNoAGPR::ID = 0;

-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  for (unsigned I = 0;
-       I < F.arg_size() &&
-       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-       ++I) {
-    Argument &Arg = *F.getArg(I);
-    // Check for incompatible attributes.
-    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-      break;
-
-    Arg.addAttr(Attribute::InReg);
-  }
-}
-
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options) {
   SetVector<Function *> Functions;
@@ -1394,8 +1375,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
       A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
-    } else if (CC == CallingConv::AMDGPU_KERNEL) {
-      addPreloadKernArgHint(*F, TM);
     }

     for (auto &I : instructions(F)) {
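
The hint logic deleted above is relocated rather than dropped: per the commit message, the inreg marking now happens in the new module pass. A hedged reconstruction of how it plausibly reads there, modeled directly on the removed addPreloadKernArgHint (the new pass's file is not part of this excerpt):

// Assumed near-verbatim carry-over into AMDGPUPreloadKernelArguments.cpp
// (illustrative only): mark the first N arguments inreg, bounded by the
// amdgpu-kernarg-preload-count option and the free user SGPRs.
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
for (unsigned I = 0;
     I < F.arg_size() &&
     I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
     ++I) {
  Argument &Arg = *F.getArg(I);
  if (Arg.hasByRefAttr() || Arg.hasNestAttr()) // incompatible with preloading
    break;
  Arg.addAttr(Attribute::InReg);
}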

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 2 additions & 253 deletions
@@ -27,230 +27,6 @@ using namespace llvm;

 namespace {

-class PreloadKernelArgInfo {
-private:
-  Function &F;
-  const GCNSubtarget &ST;
-  unsigned NumFreeUserSGPRs;
-
-  enum HiddenArg : unsigned {
-    HIDDEN_BLOCK_COUNT_X,
-    HIDDEN_BLOCK_COUNT_Y,
-    HIDDEN_BLOCK_COUNT_Z,
-    HIDDEN_GROUP_SIZE_X,
-    HIDDEN_GROUP_SIZE_Y,
-    HIDDEN_GROUP_SIZE_Z,
-    HIDDEN_REMAINDER_X,
-    HIDDEN_REMAINDER_Y,
-    HIDDEN_REMAINDER_Z,
-    END_HIDDEN_ARGS
-  };
-
-  // Stores information about a specific hidden argument.
-  struct HiddenArgInfo {
-    // Offset in bytes from the location in the kernearg segment pointed to by
-    // the implicitarg pointer.
-    uint8_t Offset;
-    // The size of the hidden argument in bytes.
-    uint8_t Size;
-    // The name of the hidden argument in the kernel signature.
-    const char *Name;
-  };
-
-  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
-      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
-      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
-      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
-      {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
-      {22, 2, "_hidden_remainder_z"}};
-
-  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
-    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
-      if (HiddenArgs[I].Offset == Offset)
-        return static_cast<HiddenArg>(I);
-
-    return END_HIDDEN_ARGS;
-  }
-
-  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
-    if (HA < END_HIDDEN_ARGS)
-      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
-
-    llvm_unreachable("Unexpected hidden argument.");
-  }
-
-  static const char *getHiddenArgName(HiddenArg HA) {
-    if (HA < END_HIDDEN_ARGS) {
-      return HiddenArgs[HA].Name;
-    }
-    llvm_unreachable("Unexpected hidden argument.");
-  }
-
-  // Clones the function after adding implicit arguments to the argument list
-  // and returns the new updated function. Preloaded implicit arguments are
-  // added up to and including the last one that will be preloaded, indicated by
-  // LastPreloadIndex. Currently preloading is only performed on the totality of
-  // sequential data from the kernarg segment including implicit (hidden)
-  // arguments. This means that all arguments up to the last preloaded argument
-  // will also be preloaded even if that data is unused.
-  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
-    FunctionType *FT = F.getFunctionType();
-    LLVMContext &Ctx = F.getParent()->getContext();
-    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
-    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
-      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
-
-    FunctionType *NFT =
-        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
-    Function *NF =
-        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
-
-    NF->copyAttributesFrom(&F);
-    NF->copyMetadata(&F, 0);
-    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
-
-    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
-    NF->takeName(&F);
-    NF->splice(NF->begin(), &F);
-
-    Function::arg_iterator NFArg = NF->arg_begin();
-    for (Argument &Arg : F.args()) {
-      Arg.replaceAllUsesWith(&*NFArg);
-      NFArg->takeName(&Arg);
-      ++NFArg;
-    }
-
-    AttrBuilder AB(Ctx);
-    AB.addAttribute(Attribute::InReg);
-    AB.addAttribute("amdgpu-hidden-argument");
-    AttributeList AL = NF->getAttributes();
-    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
-      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
-      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
-    }
-
-    NF->setAttributes(AL);
-    F.replaceAllUsesWith(NF);
-    F.setCallingConv(CallingConv::C);
-
-    return NF;
-  }
-
-public:
-  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
-    setInitialFreeUserSGPRsCount();
-  }
-
-  // Returns the maximum number of user SGPRs that we have available to preload
-  // arguments.
-  void setInitialFreeUserSGPRsCount() {
-    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
-    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
-  }
-
-  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
-                            uint64_t LastExplicitArgOffset) {
-    // Check if this argument may be loaded into the same register as the
-    // previous argument.
-    if (ArgOffset - LastExplicitArgOffset < 4 &&
-        !isAligned(Align(4), ArgOffset))
-      return true;
-
-    // Pad SGPRs for kernarg alignment.
-    ArgOffset = alignDown(ArgOffset, 4);
-    unsigned Padding = ArgOffset - LastExplicitArgOffset;
-    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
-    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
-    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
-      return false;
-
-    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
-    return true;
-  }
-
-  // Try to allocate SGPRs to preload implicit kernel arguments.
-  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
-                                       uint64_t LastExplicitArgOffset,
-                                       IRBuilder<> &Builder) {
-    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
-        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
-    if (!ImplicitArgPtr)
-      return;
-
-    const DataLayout &DL = F.getParent()->getDataLayout();
-    // Pair is the load and the load offset.
-    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
-    for (auto *U : ImplicitArgPtr->users()) {
-      Instruction *CI = dyn_cast<Instruction>(U);
-      if (!CI || CI->getParent()->getParent() != &F)
-        continue;
-
-      for (auto *U : CI->users()) {
-        int64_t Offset = 0;
-        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
-        if (!Load) {
-          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-            continue;
-
-          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
-        }
-
-        if (!Load || !Load->isSimple())
-          continue;
-
-        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
-        LLVMContext &Ctx = F.getParent()->getContext();
-        Type *LoadTy = Load->getType();
-        HiddenArg HA = getHiddenArgFromOffset(Offset);
-        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
-          continue;
-
-        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
-      }
-    }
-
-    if (ImplicitArgLoads.empty())
-      return;
-
-    // Allocate loads in order of offset. We need to be sure that the implicit
-    // argument can actually be preloaded.
-    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
-
-    // If we fail to preload any implicit argument we know we don't have SGPRs
-    // to preload any subsequent ones with larger offsets. Find the first
-    // argument that we cannot preload.
-    auto *PreloadEnd = std::find_if(
-        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
-        [&](const std::pair<LoadInst *, unsigned> &Load) {
-          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
-          unsigned LoadOffset = Load.second;
-          if (!tryAllocPreloadSGPRs(LoadSize,
-                                    LoadOffset + ImplicitArgsBaseOffset,
-                                    LastExplicitArgOffset))
-            return true;
-
-          LastExplicitArgOffset =
-              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
-          return false;
-        });
-
-    if (PreloadEnd == ImplicitArgLoads.begin())
-      return;
-
-    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
-    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
-    assert(NF);
-    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
-      LoadInst *LoadInst = I->first;
-      unsigned LoadOffset = I->second;
-      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
-      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
-      Argument *Arg = NF->getArg(Index);
-      LoadInst->replaceAllUsesWith(Arg);
-    }
-  }
-};
-
 class AMDGPULowerKernelArguments : public FunctionPass {
 public:
   static char ID;
@@ -310,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
       Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

   uint64_t ExplicitArgOffset = 0;
-  // Preloaded kernel arguments must be sequential.
-  bool InPreloadSequence = true;
-  PreloadKernelArgInfo PreloadInfo(F, ST);
-
   for (Argument &Arg : F.args()) {
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -324,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

     uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
-    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

-    // Guard against the situation where hidden arguments have already been
-    // lowered and added to the kernel function signiture, i.e. in a situation
-    // where this pass has run twice.
-    if (Arg.hasAttribute("amdgpu-hidden-argument"))
-      break;
-
-    // Try to preload this argument into user SGPRs.
-    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
-        !Arg.getType()->isAggregateType())
-      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
-                                           LastExplicitArgOffset))
-        continue;
-
-    InPreloadSequence = false;
-
-    if (Arg.use_empty())
+    // Skip inreg arguments which should be preloaded.
+    if (Arg.use_empty() || Arg.hasInRegAttr())
       continue;

     // If this is byval, the loads are already explicit in the function. We just
@@ -482,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

-  if (InPreloadSequence) {
-    uint64_t ImplicitArgsBaseOffset =
-        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
-        BaseOffset;
-    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
-                                                ExplicitArgOffset, Builder);
-  }
-
   return true;
 }
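
Net effect of this file's diff: AMDGPULowerKernelArguments no longer does any preload bookkeeping of its own. It treats inreg as the single source of truth, skipping such arguments so that direct uses of preloaded values are left intact, and lowers every other argument to kernarg-segment loads exactly as before. This is also why the commit touches lit tests: kernels carrying inreg without actually wanting preloading would otherwise skip lowering for those arguments.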

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-perf-hint",
             AMDGPUPerfHintAnalysisPass(
                 *static_cast<const GCNTargetMachine *>(this)))
+MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
