@@ -27,230 +27,6 @@ using namespace llvm;
2727
2828namespace {
2929
30- class PreloadKernelArgInfo {
31- private:
32- Function &F;
33- const GCNSubtarget &ST;
34- unsigned NumFreeUserSGPRs;
35-
36- enum HiddenArg : unsigned {
37- HIDDEN_BLOCK_COUNT_X,
38- HIDDEN_BLOCK_COUNT_Y,
39- HIDDEN_BLOCK_COUNT_Z,
40- HIDDEN_GROUP_SIZE_X,
41- HIDDEN_GROUP_SIZE_Y,
42- HIDDEN_GROUP_SIZE_Z,
43- HIDDEN_REMAINDER_X,
44- HIDDEN_REMAINDER_Y,
45- HIDDEN_REMAINDER_Z,
46- END_HIDDEN_ARGS
47- };
48-
49- // Stores information about a specific hidden argument.
50- struct HiddenArgInfo {
51- // Offset in bytes from the location in the kernearg segment pointed to by
52- // the implicitarg pointer.
53- uint8_t Offset;
54- // The size of the hidden argument in bytes.
55- uint8_t Size;
56- // The name of the hidden argument in the kernel signature.
57- const char *Name;
58- };
59-
60- static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61- {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
62- {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
63- {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
64- {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
65- {22 , 2 , " _hidden_remainder_z" }};
66-
67- static HiddenArg getHiddenArgFromOffset (unsigned Offset) {
68- for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
69- if (HiddenArgs[I].Offset == Offset)
70- return static_cast <HiddenArg>(I);
71-
72- return END_HIDDEN_ARGS;
73- }
74-
75- static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
76- if (HA < END_HIDDEN_ARGS)
77- return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
78-
79- llvm_unreachable (" Unexpected hidden argument." );
80- }
81-
82- static const char *getHiddenArgName (HiddenArg HA) {
83- if (HA < END_HIDDEN_ARGS) {
84- return HiddenArgs[HA].Name ;
85- }
86- llvm_unreachable (" Unexpected hidden argument." );
87- }
88-
89- // Clones the function after adding implicit arguments to the argument list
90- // and returns the new updated function. Preloaded implicit arguments are
91- // added up to and including the last one that will be preloaded, indicated by
92- // LastPreloadIndex. Currently preloading is only performed on the totality of
93- // sequential data from the kernarg segment including implicit (hidden)
94- // arguments. This means that all arguments up to the last preloaded argument
95- // will also be preloaded even if that data is unused.
96- Function *cloneFunctionWithPreloadImplicitArgs (unsigned LastPreloadIndex) {
97- FunctionType *FT = F.getFunctionType ();
98- LLVMContext &Ctx = F.getParent ()->getContext ();
99- SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
100- for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
101- FTypes.push_back (getHiddenArgType (Ctx, HiddenArg (I)));
102-
103- FunctionType *NFT =
104- FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
105- Function *NF =
106- Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
107-
108- NF->copyAttributesFrom (&F);
109- NF->copyMetadata (&F, 0 );
110- NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
111-
112- F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
113- NF->takeName (&F);
114- NF->splice (NF->begin (), &F);
115-
116- Function::arg_iterator NFArg = NF->arg_begin ();
117- for (Argument &Arg : F.args ()) {
118- Arg.replaceAllUsesWith (&*NFArg);
119- NFArg->takeName (&Arg);
120- ++NFArg;
121- }
122-
123- AttrBuilder AB (Ctx);
124- AB.addAttribute (Attribute::InReg);
125- AB.addAttribute (" amdgpu-hidden-argument" );
126- AttributeList AL = NF->getAttributes ();
127- for (unsigned I = 0 ; I <= LastPreloadIndex; ++I) {
128- AL = AL.addParamAttributes (Ctx, NFArg->getArgNo (), AB);
129- NFArg++->setName (getHiddenArgName (HiddenArg (I)));
130- }
131-
132- NF->setAttributes (AL);
133- F.replaceAllUsesWith (NF);
134- F.setCallingConv (CallingConv::C);
135-
136- return NF;
137- }
138-
139- public:
140- PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
141- setInitialFreeUserSGPRsCount ();
142- }
143-
144- // Returns the maximum number of user SGPRs that we have available to preload
145- // arguments.
146- void setInitialFreeUserSGPRsCount () {
147- GCNUserSGPRUsageInfo UserSGPRInfo (F, ST);
148- NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs ();
149- }
150-
151- bool tryAllocPreloadSGPRs (unsigned AllocSize, uint64_t ArgOffset,
152- uint64_t LastExplicitArgOffset) {
153- // Check if this argument may be loaded into the same register as the
154- // previous argument.
155- if (ArgOffset - LastExplicitArgOffset < 4 &&
156- !isAligned (Align (4 ), ArgOffset))
157- return true ;
158-
159- // Pad SGPRs for kernarg alignment.
160- ArgOffset = alignDown (ArgOffset, 4 );
161- unsigned Padding = ArgOffset - LastExplicitArgOffset;
162- unsigned PaddingSGPRs = alignTo (Padding, 4 ) / 4 ;
163- unsigned NumPreloadSGPRs = alignTo (AllocSize, 4 ) / 4 ;
164- if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
165- return false ;
166-
167- NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
168- return true ;
169- }
170-
171- // Try to allocate SGPRs to preload implicit kernel arguments.
172- void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
173- uint64_t LastExplicitArgOffset,
174- IRBuilder<> &Builder) {
175- Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists (
176- F.getParent (), Intrinsic::amdgcn_implicitarg_ptr);
177- if (!ImplicitArgPtr)
178- return ;
179-
180- const DataLayout &DL = F.getParent ()->getDataLayout ();
181- // Pair is the load and the load offset.
182- SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
183- for (auto *U : ImplicitArgPtr->users ()) {
184- Instruction *CI = dyn_cast<Instruction>(U);
185- if (!CI || CI->getParent ()->getParent () != &F)
186- continue ;
187-
188- for (auto *U : CI->users ()) {
189- int64_t Offset = 0 ;
190- auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
191- if (!Load) {
192- if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
193- continue ;
194-
195- Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
196- }
197-
198- if (!Load || !Load->isSimple ())
199- continue ;
200-
201- // FIXME: Expand to handle 64-bit implicit args and large merged loads.
202- LLVMContext &Ctx = F.getParent ()->getContext ();
203- Type *LoadTy = Load->getType ();
204- HiddenArg HA = getHiddenArgFromOffset (Offset);
205- if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType (Ctx, HA))
206- continue ;
207-
208- ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
209- }
210- }
211-
212- if (ImplicitArgLoads.empty ())
213- return ;
214-
215- // Allocate loads in order of offset. We need to be sure that the implicit
216- // argument can actually be preloaded.
217- std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (), less_second ());
218-
219- // If we fail to preload any implicit argument we know we don't have SGPRs
220- // to preload any subsequent ones with larger offsets. Find the first
221- // argument that we cannot preload.
222- auto *PreloadEnd = std::find_if (
223- ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
224- [&](const std::pair<LoadInst *, unsigned > &Load) {
225- unsigned LoadSize = DL.getTypeStoreSize (Load.first ->getType ());
226- unsigned LoadOffset = Load.second ;
227- if (!tryAllocPreloadSGPRs (LoadSize,
228- LoadOffset + ImplicitArgsBaseOffset,
229- LastExplicitArgOffset))
230- return true ;
231-
232- LastExplicitArgOffset =
233- ImplicitArgsBaseOffset + LoadOffset + LoadSize;
234- return false ;
235- });
236-
237- if (PreloadEnd == ImplicitArgLoads.begin ())
238- return ;
239-
240- unsigned LastHiddenArgIndex = getHiddenArgFromOffset (PreloadEnd[-1 ].second );
241- Function *NF = cloneFunctionWithPreloadImplicitArgs (LastHiddenArgIndex);
242- assert (NF);
243- for (const auto *I = ImplicitArgLoads.begin (); I != PreloadEnd; ++I) {
244- LoadInst *LoadInst = I->first ;
245- unsigned LoadOffset = I->second ;
246- unsigned HiddenArgIndex = getHiddenArgFromOffset (LoadOffset);
247- unsigned Index = NF->arg_size () - LastHiddenArgIndex + HiddenArgIndex - 1 ;
248- Argument *Arg = NF->getArg (Index);
249- LoadInst->replaceAllUsesWith (Arg);
250- }
251- }
252- };
253-
25430class AMDGPULowerKernelArguments : public FunctionPass {
25531public:
25632 static char ID;
@@ -310,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
31086 Attribute::getWithDereferenceableBytes (Ctx, TotalKernArgSize));
31187
31288 uint64_t ExplicitArgOffset = 0 ;
313- // Preloaded kernel arguments must be sequential.
314- bool InPreloadSequence = true ;
315- PreloadKernelArgInfo PreloadInfo (F, ST);
316-
31789 for (Argument &Arg : F.args ()) {
31890 const bool IsByRef = Arg.hasByRefAttr ();
31991 Type *ArgTy = IsByRef ? Arg.getParamByRefType () : Arg.getType ();
@@ -324,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
32496 uint64_t AllocSize = DL.getTypeAllocSize (ArgTy);
32597
32698 uint64_t EltOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + BaseOffset;
327- uint64_t LastExplicitArgOffset = ExplicitArgOffset;
32899 ExplicitArgOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + AllocSize;
329100
330- // Guard against the situation where hidden arguments have already been
331- // lowered and added to the kernel function signature, i.e. in a situation
332- // where this pass has run twice.
333- if (Arg.hasAttribute (" amdgpu-hidden-argument" ))
334- break ;
335-
336- // Try to preload this argument into user SGPRs.
337- if (Arg.hasInRegAttr () && InPreloadSequence && ST.hasKernargPreload () &&
338- !Arg.getType ()->isAggregateType ())
339- if (PreloadInfo.tryAllocPreloadSGPRs (AllocSize, EltOffset,
340- LastExplicitArgOffset))
341- continue ;
342-
343- InPreloadSequence = false ;
344-
345- if (Arg.use_empty ())
101+ // Skip inreg arguments which should be preloaded.
102+ if (Arg.use_empty () || Arg.hasInRegAttr ())
346103 continue ;
347104
348105 // If this is byval, the loads are already explicit in the function. We just
@@ -482,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
482239 KernArgSegment->addRetAttr (
483240 Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
484241
485- if (InPreloadSequence) {
486- uint64_t ImplicitArgsBaseOffset =
487- alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
488- BaseOffset;
489- PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
490- ExplicitArgOffset, Builder);
491- }
492-
493242 return true ;
494243}
495244
0 commit comments