1313
1414#include " AMDGPU.h"
1515#include " GCNSubtarget.h"
16+ #include " llvm/Analysis/ValueTracking.h"
1617#include " llvm/CodeGen/TargetPassConfig.h"
1718#include " llvm/IR/IRBuilder.h"
1819#include " llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +32,88 @@ class PreloadKernelArgInfo {
3132 const GCNSubtarget &ST;
3233 unsigned NumFreeUserSGPRs;
3334
34- public:
35- SmallVector<llvm::Metadata *, 8 > KernelArgMetadata;
// Hidden kernel arguments known to this class. Each enumerator's value is
// used as an index into the HiddenArgs table; END_HIDDEN_ARGS is not an
// argument itself but the number of enumerators preceding it.
enum HiddenArg : unsigned {
  // Block counts, one per dimension.
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y,
  HIDDEN_BLOCK_COUNT_Z,
  // Group sizes, one per dimension.
  HIDDEN_GROUP_SIZE_X,
  HIDDEN_GROUP_SIZE_Y,
  HIDDEN_GROUP_SIZE_Z,
  // Remainders, one per dimension.
  HIDDEN_REMAINDER_X,
  HIDDEN_REMAINDER_Y,
  HIDDEN_REMAINDER_Z,
  // Sentinel: count of the hidden arguments listed above.
  END_HIDDEN_ARGS
};
47+
// One row of the HiddenArgs table describing a single hidden argument.
struct HiddenArgInfo {
  // Offset of the argument; compared against load offsets when mapping a
  // load back to a HiddenArg.
  unsigned Offset;
  // Size of the argument; multiplied by 8 to obtain its integer bit width.
  unsigned Size;
  // Name given to the corresponding function parameter.
  const char *Name;
};
53+
54+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
55+ {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
56+ {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
57+ {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
58+ {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
59+ {22 , 2 , " _hidden_remainder_z" }};
60+
61+ static HiddenArg getHiddenArgIndexFromOffset (unsigned Offset) {
62+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
63+ if (HiddenArgs[I].Offset == Offset)
64+ return static_cast <HiddenArg>(I);
65+
66+ llvm_unreachable (" Unexpected hidden argument offset." );
67+ }
68+
69+ static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
70+ if (HA < END_HIDDEN_ARGS)
71+ return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
72+
73+ llvm_unreachable (" Unexpected hidden argument." );
74+ }
75+
76+ static const char *getHiddenArgName (HiddenArg HA) {
77+ if (HA < END_HIDDEN_ARGS) {
78+ return HiddenArgs[HA].Name ;
79+ }
80+ llvm_unreachable (" Unexpected hidden argument." );
81+ }
3682
83+ Function *cloneFunctionWithPreloadImplicitArgs () {
84+ FunctionType *FT = F.getFunctionType ();
85+ std::vector<Type *> FTypes (FT->param_begin (), FT->param_end ());
86+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
87+ FTypes.push_back (getHiddenArgType (F.getContext (), HiddenArg (I)));
88+
89+ FunctionType *NFT =
90+ FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
91+ Function *NF =
92+ Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
93+
94+ NF->copyAttributesFrom (&F);
95+ NF->copyMetadata (&F, 0 );
96+ NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
97+
98+ F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
99+ NF->takeName (&F);
100+ assert (F.use_empty ());
101+ NF->splice (NF->begin (), &F);
102+
103+ Function::arg_iterator NFArg = NF->arg_begin ();
104+ for (Argument &Arg : F.args ()) {
105+ Arg.replaceAllUsesWith (&*NFArg);
106+ NFArg->takeName (&Arg);
107+ ++NFArg;
108+ }
109+
110+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
111+ NFArg++->setName (getHiddenArgName (HiddenArg (I)));
112+
113+ return NF;
114+ }
115+
116+ public:
37117 PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38118 setInitialFreeUserSGPRsCount ();
39119 }
@@ -64,6 +144,94 @@ class PreloadKernelArgInfo {
64144 NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65145 return true ;
66146 }
147+
148+ // Try to allocate SGPRs to preload implicit kernel arguments.
149+ void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
150+ IRBuilder<> &Builder) {
151+ StringRef Name = Intrinsic::getName (Intrinsic::amdgcn_implicitarg_ptr);
152+ Function *ImplicitArgPtr = F.getParent ()->getFunction (Name);
153+ if (!ImplicitArgPtr)
154+ return ;
155+
156+ const DataLayout &DL = F.getParent ()->getDataLayout ();
157+ // Pair is the load and the load offset.
158+ SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
159+ for (auto *U : ImplicitArgPtr->users ()) {
160+ Instruction *CI = dyn_cast<Instruction>(U);
161+ if (!CI || CI->getParent ()->getParent () != &F)
162+ continue ;
163+
164+ for (auto *U : CI->users ()) {
165+ int64_t Offset = 0 ;
166+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
167+ if (!Load) {
168+ if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
169+ continue ;
170+
171+ Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
172+ }
173+
174+ if (!Load || !Load->isSimple ())
175+ continue ;
176+
177+ // FIXME: Expand to handle 64-bit implicit args and large merged loads.
178+ unsigned LoadSize = Load->getType ()->getScalarSizeInBits ();
179+ if (LoadSize != 32 && LoadSize != 16 )
180+ continue ;
181+
182+ ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
183+ }
184+ }
185+
186+ if (ImplicitArgLoads.empty ())
187+ return ;
188+
189+ // Allocate loads in order of offset. We need to be sure that the implicit
190+ // argument can actually be preloaded.
191+ std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
192+ [](const std::pair<LoadInst *, unsigned > &A,
193+ const std::pair<LoadInst *, unsigned > &B) {
194+ return A.second < B.second ;
195+ });
196+
197+ uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
198+ bool AddedHiddenArgsToSignature = false ;
199+ Function *NF = nullptr ;
200+ unsigned LastPreloadIndex = 0 ;
201+ for (const auto &Load : ImplicitArgLoads) {
202+ LoadInst *LoadInst = Load.first ;
203+ Type *LoadType = LoadInst->getType ();
204+ auto LoadOffset = Load.second ;
205+ unsigned LoadSize = DL.getTypeStoreSize (LoadType);
206+ // If we fail to preload any implicit argument we know we don't have SGPRs
207+ // to preload any subsequent ones with larger offsets.
208+ if (!tryAllocPreloadSGPRs (LoadSize, LoadOffset + ImplicitArgsBaseOffset,
209+ LastExplicitArgOffset))
210+ break ;
211+
212+ if (!AddedHiddenArgsToSignature) {
213+ NF = cloneFunctionWithPreloadImplicitArgs ();
214+ AddedHiddenArgsToSignature = true ;
215+ }
216+
217+ LastExplicitArgOffset = LoadOffset + LoadSize;
218+ unsigned HiddenArgIndex = getHiddenArgIndexFromOffset (LoadOffset);
219+ assert (NF);
220+ unsigned Index = NF->arg_size () - END_HIDDEN_ARGS + HiddenArgIndex;
221+ Argument *Arg = NF->getArg (Index);
222+ LoadInst->replaceAllUsesWith (Arg);
223+ if (Index > HiddenArgIndex)
224+ LastPreloadIndex = HiddenArgIndex;
225+ }
226+
227+ // Ensure all hidden arguments up to the final preload are also
228+ // preloaded, even if some are unused.
229+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
230+ NF->getArg (NF->arg_size () - END_HIDDEN_ARGS + I)
231+ ->addAttr (Attribute::InReg);
232+
233+ F.removeFromParent ();
234+ }
67235};
68236
69237class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +449,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281449 KernArgSegment->addRetAttr (
282450 Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
283451
452+ if (InPreloadSequence) {
453+ uint64_t ImplicitArgsBaseOffset =
454+ alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
455+ BaseOffset;
456+ PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
457+ Builder);
458+ }
459+
284460 return true ;
285461}
286462
0 commit comments