1313
1414#include " AMDGPU.h"
1515#include " GCNSubtarget.h"
16+ #include " llvm/ADT/StringExtras.h"
17+ #include " llvm/Analysis/ValueTracking.h"
1618#include " llvm/CodeGen/TargetPassConfig.h"
1719#include " llvm/IR/IRBuilder.h"
1820#include " llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,93 @@ class PreloadKernelArgInfo {
3133 const GCNSubtarget &ST;
3234 unsigned NumFreeUserSGPRs;
3335
// Indices of the preloadable hidden (implicit) kernel arguments. The order
// mirrors the byte-offset layout recorded in HiddenArgs below.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X,
  HIDDEN_BLOCK_COUNT_Y,
  HIDDEN_BLOCK_COUNT_Z,
  HIDDEN_GROUP_SIZE_X,
  HIDDEN_GROUP_SIZE_Y,
  HIDDEN_GROUP_SIZE_Z,
  HIDDEN_REMAINDER_X,
  HIDDEN_REMAINDER_Y,
  HIDDEN_REMAINDER_Z,
  END_HIDDEN_ARGS
};

// Byte offset into the implicit-argument area, byte size, and the name given
// to the corresponding function argument for one hidden argument.
struct HiddenArgInfo {
  unsigned Offset;
  unsigned Size;
  const char *Name;
};

// Packed layout of the hidden arguments: three 32-bit block counts followed
// by three 16-bit group sizes and three 16-bit remainders.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
    {0, 4, "_hidden_block_count_x"},  {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"},  {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"},  {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},   {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"}};
61+
62+ static HiddenArg getHiddenArgIndexFromOffset (unsigned Offset) {
63+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
64+ if (HiddenArgs[I].Offset == Offset)
65+ return static_cast <HiddenArg>(I);
66+
67+ llvm_unreachable (" Unexpected hidden argument offset." );
68+ }
69+
70+ static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
71+ if (HA < END_HIDDEN_ARGS)
72+ return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
73+
74+ llvm_unreachable (" Unexpected hidden argument." );
75+ }
3676
77+ static const char *getHiddenArgName (HiddenArg HA) {
78+ if (HA < END_HIDDEN_ARGS) {
79+ return HiddenArgs[HA].Name ;
80+ }
81+ llvm_unreachable (" Unexpected hidden argument." );
82+ }
83+
84+ Function *cloneFunctionWithPreloadImplicitArgs () {
85+ FunctionType *FT = F.getFunctionType ();
86+ SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
87+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
88+ FTypes.push_back (getHiddenArgType (F.getContext (), HiddenArg (I)));
89+
90+ FunctionType *NFT =
91+ FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
92+ Function *NF =
93+ Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
94+
95+ NF->copyAttributesFrom (&F);
96+ NF->copyMetadata (&F, 0 );
97+ NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
98+
99+ F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
100+ NF->takeName (&F);
101+ NF->splice (NF->begin (), &F);
102+
103+ Function::arg_iterator NFArg = NF->arg_begin ();
104+ for (Argument &Arg : F.args ()) {
105+ Arg.replaceAllUsesWith (&*NFArg);
106+ NFArg->takeName (&Arg);
107+ ++NFArg;
108+ }
109+
110+ // Add an attribute that tracks the index offset to the first hidden
111+ // argument.
112+ NF->addFnAttr (" amdgpu-hidden-arg-offset" , utostr (NFArg->getArgNo ()));
113+
114+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
115+ NFArg++->setName (getHiddenArgName (HiddenArg (I)));
116+
117+ F.replaceAllUsesWith (NF);
118+
119+ return NF;
120+ }
121+
122+ public:
37123 PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38124 setInitialFreeUserSGPRsCount ();
39125 }
@@ -64,6 +150,94 @@ class PreloadKernelArgInfo {
64150 NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65151 return true ;
66152 }
153+
154+ // Try to allocate SGPRs to preload implicit kernel arguments.
155+ void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
156+ IRBuilder<> &Builder) {
157+ StringRef Name = Intrinsic::getName (Intrinsic::amdgcn_implicitarg_ptr);
158+ Function *ImplicitArgPtr = F.getParent ()->getFunction (Name);
159+ if (!ImplicitArgPtr)
160+ return ;
161+
162+ const DataLayout &DL = F.getParent ()->getDataLayout ();
163+ // Pair is the load and the load offset.
164+ SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
165+ for (auto *U : ImplicitArgPtr->users ()) {
166+ Instruction *CI = dyn_cast<Instruction>(U);
167+ if (!CI || CI->getParent ()->getParent () != &F)
168+ continue ;
169+
170+ for (auto *U : CI->users ()) {
171+ int64_t Offset = 0 ;
172+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
173+ if (!Load) {
174+ if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
175+ continue ;
176+
177+ Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
178+ }
179+
180+ if (!Load || !Load->isSimple ())
181+ continue ;
182+
183+ // FIXME: Expand to handle 64-bit implicit args and large merged loads.
184+ unsigned LoadSize = Load->getType ()->getScalarSizeInBits ();
185+ if (LoadSize != 32 && LoadSize != 16 )
186+ continue ;
187+
188+ ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
189+ }
190+ }
191+
192+ if (ImplicitArgLoads.empty ())
193+ return ;
194+
195+ // Allocate loads in order of offset. We need to be sure that the implicit
196+ // argument can actually be preloaded.
197+ std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
198+ [](const std::pair<LoadInst *, unsigned > &A,
199+ const std::pair<LoadInst *, unsigned > &B) {
200+ return A.second < B.second ;
201+ });
202+
203+ uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
204+ bool AddedHiddenArgsToSignature = false ;
205+ Function *NF = nullptr ;
206+ unsigned LastPreloadIndex = 0 ;
207+ for (const auto &Load : ImplicitArgLoads) {
208+ LoadInst *LoadInst = Load.first ;
209+ Type *LoadType = LoadInst->getType ();
210+ auto LoadOffset = Load.second ;
211+ unsigned LoadSize = DL.getTypeStoreSize (LoadType);
212+ // If we fail to preload any implicit argument we know we don't have SGPRs
213+ // to preload any subsequent ones with larger offsets.
214+ if (!tryAllocPreloadSGPRs (LoadSize, LoadOffset + ImplicitArgsBaseOffset,
215+ LastExplicitArgOffset))
216+ break ;
217+
218+ if (!AddedHiddenArgsToSignature) {
219+ NF = cloneFunctionWithPreloadImplicitArgs ();
220+ AddedHiddenArgsToSignature = true ;
221+ }
222+
223+ LastExplicitArgOffset = LoadOffset + LoadSize;
224+ unsigned HiddenArgIndex = getHiddenArgIndexFromOffset (LoadOffset);
225+ assert (NF);
226+ unsigned Index = NF->arg_size () - END_HIDDEN_ARGS + HiddenArgIndex;
227+ Argument *Arg = NF->getArg (Index);
228+ LoadInst->replaceAllUsesWith (Arg);
229+ if (Index > HiddenArgIndex)
230+ LastPreloadIndex = HiddenArgIndex;
231+ }
232+
233+ // Ensure all hidden arguments up to the final preload are also
234+ // preloaded, even if some are unused.
235+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
236+ NF->getArg (NF->arg_size () - END_HIDDEN_ARGS + I)
237+ ->addAttr (Attribute::InReg);
238+
239+ F.removeFromParent ();
240+ }
67241};
68242
69243class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +455,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281455 KernArgSegment->addRetAttr (
282456 Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
283457
458+ if (InPreloadSequence) {
459+ uint64_t ImplicitArgsBaseOffset =
460+ alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
461+ BaseOffset;
462+ PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
463+ Builder);
464+ }
465+
284466 return true ;
285467}
286468
0 commit comments