@@ -27,231 +27,6 @@ using namespace llvm;
2727
2828namespace {
2929
30- class PreloadKernelArgInfo {
31- private:
32- Function &F;
33- const GCNSubtarget &ST;
34- unsigned NumFreeUserSGPRs;
35-
36- enum HiddenArg : unsigned {
37- HIDDEN_BLOCK_COUNT_X,
38- HIDDEN_BLOCK_COUNT_Y,
39- HIDDEN_BLOCK_COUNT_Z,
40- HIDDEN_GROUP_SIZE_X,
41- HIDDEN_GROUP_SIZE_Y,
42- HIDDEN_GROUP_SIZE_Z,
43- HIDDEN_REMAINDER_X,
44- HIDDEN_REMAINDER_Y,
45- HIDDEN_REMAINDER_Z,
46- END_HIDDEN_ARGS
47- };
48-
49- // Stores information about a specific hidden argument.
50- struct HiddenArgInfo {
51- // Offset in bytes from the location in the kernearg segment pointed to by
52- // the implicitarg pointer.
53- uint8_t Offset;
54- // The size of the hidden argument in bytes.
55- uint8_t Size;
56- // The name of the hidden argument in the kernel signature.
57- const char *Name;
58- };
59-
60- static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61- {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
62- {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
63- {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
64- {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
65- {22 , 2 , " _hidden_remainder_z" }};
66-
67- static HiddenArg getHiddenArgFromOffset (unsigned Offset) {
68- for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
69- if (HiddenArgs[I].Offset == Offset)
70- return static_cast <HiddenArg>(I);
71-
72- return END_HIDDEN_ARGS;
73- }
74-
75- static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
76- if (HA < END_HIDDEN_ARGS)
77- return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
78-
79- llvm_unreachable (" Unexpected hidden argument." );
80- }
81-
82- static const char *getHiddenArgName (HiddenArg HA) {
83- if (HA < END_HIDDEN_ARGS) {
84- return HiddenArgs[HA].Name ;
85- }
86- llvm_unreachable (" Unexpected hidden argument." );
87- }
88-
89- // Clones the function after adding implicit arguments to the argument list
90- // and returns the new updated function. Preloaded implicit arguments are
91- // added up to and including the last one that will be preloaded, indicated by
92- // LastPreloadIndex. Currently preloading is only performed on the totality of
93- // sequential data from the kernarg segment including implicit (hidden)
94- // arguments. This means that all arguments up to the last preloaded argument
95- // will also be preloaded even if that data is unused.
96- Function *cloneFunctionWithPreloadImplicitArgs (unsigned LastPreloadIndex) {
97- FunctionType *FT = F.getFunctionType ();
98- LLVMContext &Ctx = F.getParent ()->getContext ();
99- SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
100- for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
101- FTypes.push_back (getHiddenArgType (Ctx, HiddenArg (I)));
102-
103- FunctionType *NFT =
104- FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
105- Function *NF =
106- Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
107-
108- NF->copyAttributesFrom (&F);
109- NF->copyMetadata (&F, 0 );
110- NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
111-
112- F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
113- NF->takeName (&F);
114- NF->splice (NF->begin (), &F);
115-
116- Function::arg_iterator NFArg = NF->arg_begin ();
117- for (Argument &Arg : F.args ()) {
118- Arg.replaceAllUsesWith (&*NFArg);
119- NFArg->takeName (&Arg);
120- ++NFArg;
121- }
122-
123- AttrBuilder AB (Ctx);
124- AB.addAttribute (Attribute::InReg);
125- AB.addAttribute (" amdgpu-hidden-argument" );
126- AttributeList AL = NF->getAttributes ();
127- for (unsigned I = 0 ; I <= LastPreloadIndex; ++I) {
128- AL = AL.addParamAttributes (Ctx, NFArg->getArgNo (), AB);
129- NFArg++->setName (getHiddenArgName (HiddenArg (I)));
130- }
131-
132- NF->setAttributes (AL);
133- F.replaceAllUsesWith (NF);
134- F.setCallingConv (CallingConv::C);
135- F.clearMetadata ();
136-
137- return NF;
138- }
139-
140- public:
141- PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
142- setInitialFreeUserSGPRsCount ();
143- }
144-
145- // Returns the maximum number of user SGPRs that we have available to preload
146- // arguments.
147- void setInitialFreeUserSGPRsCount () {
148- GCNUserSGPRUsageInfo UserSGPRInfo (F, ST);
149- NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs ();
150- }
151-
152- bool tryAllocPreloadSGPRs (unsigned AllocSize, uint64_t ArgOffset,
153- uint64_t LastExplicitArgOffset) {
154- // Check if this argument may be loaded into the same register as the
155- // previous argument.
156- if (ArgOffset - LastExplicitArgOffset < 4 &&
157- !isAligned (Align (4 ), ArgOffset))
158- return true ;
159-
160- // Pad SGPRs for kernarg alignment.
161- ArgOffset = alignDown (ArgOffset, 4 );
162- unsigned Padding = ArgOffset - LastExplicitArgOffset;
163- unsigned PaddingSGPRs = alignTo (Padding, 4 ) / 4 ;
164- unsigned NumPreloadSGPRs = alignTo (AllocSize, 4 ) / 4 ;
165- if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
166- return false ;
167-
168- NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
169- return true ;
170- }
171-
172- // Try to allocate SGPRs to preload implicit kernel arguments.
173- void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
174- uint64_t LastExplicitArgOffset,
175- IRBuilder<> &Builder) {
176- Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists (
177- F.getParent (), Intrinsic::amdgcn_implicitarg_ptr);
178- if (!ImplicitArgPtr)
179- return ;
180-
181- const DataLayout &DL = F.getParent ()->getDataLayout ();
182- // Pair is the load and the load offset.
183- SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
184- for (auto *U : ImplicitArgPtr->users ()) {
185- Instruction *CI = dyn_cast<Instruction>(U);
186- if (!CI || CI->getParent ()->getParent () != &F)
187- continue ;
188-
189- for (auto *U : CI->users ()) {
190- int64_t Offset = 0 ;
191- auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
192- if (!Load) {
193- if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
194- continue ;
195-
196- Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
197- }
198-
199- if (!Load || !Load->isSimple ())
200- continue ;
201-
202- // FIXME: Expand to handle 64-bit implicit args and large merged loads.
203- LLVMContext &Ctx = F.getParent ()->getContext ();
204- Type *LoadTy = Load->getType ();
205- HiddenArg HA = getHiddenArgFromOffset (Offset);
206- if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType (Ctx, HA))
207- continue ;
208-
209- ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
210- }
211- }
212-
213- if (ImplicitArgLoads.empty ())
214- return ;
215-
216- // Allocate loads in order of offset. We need to be sure that the implicit
217- // argument can actually be preloaded.
218- std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (), less_second ());
219-
220- // If we fail to preload any implicit argument we know we don't have SGPRs
221- // to preload any subsequent ones with larger offsets. Find the first
222- // argument that we cannot preload.
223- auto *PreloadEnd = std::find_if (
224- ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
225- [&](const std::pair<LoadInst *, unsigned > &Load) {
226- unsigned LoadSize = DL.getTypeStoreSize (Load.first ->getType ());
227- unsigned LoadOffset = Load.second ;
228- if (!tryAllocPreloadSGPRs (LoadSize,
229- LoadOffset + ImplicitArgsBaseOffset,
230- LastExplicitArgOffset))
231- return true ;
232-
233- LastExplicitArgOffset =
234- ImplicitArgsBaseOffset + LoadOffset + LoadSize;
235- return false ;
236- });
237-
238- if (PreloadEnd == ImplicitArgLoads.begin ())
239- return ;
240-
241- unsigned LastHiddenArgIndex = getHiddenArgFromOffset (PreloadEnd[-1 ].second );
242- Function *NF = cloneFunctionWithPreloadImplicitArgs (LastHiddenArgIndex);
243- assert (NF);
244- for (const auto *I = ImplicitArgLoads.begin (); I != PreloadEnd; ++I) {
245- LoadInst *LoadInst = I->first ;
246- unsigned LoadOffset = I->second ;
247- unsigned HiddenArgIndex = getHiddenArgFromOffset (LoadOffset);
248- unsigned Index = NF->arg_size () - LastHiddenArgIndex + HiddenArgIndex - 1 ;
249- Argument *Arg = NF->getArg (Index);
250- LoadInst->replaceAllUsesWith (Arg);
251- }
252- }
253- };
254-
25530class AMDGPULowerKernelArguments : public FunctionPass {
25631public:
25732 static char ID;
@@ -311,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
31186 Attribute::getWithDereferenceableBytes (Ctx, TotalKernArgSize));
31287
31388 uint64_t ExplicitArgOffset = 0 ;
314- // Preloaded kernel arguments must be sequential.
315- bool InPreloadSequence = true ;
316- PreloadKernelArgInfo PreloadInfo (F, ST);
317-
31889 for (Argument &Arg : F.args ()) {
31990 const bool IsByRef = Arg.hasByRefAttr ();
32091 Type *ArgTy = IsByRef ? Arg.getParamByRefType () : Arg.getType ();
@@ -325,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
32596 uint64_t AllocSize = DL.getTypeAllocSize (ArgTy);
32697
32798 uint64_t EltOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + BaseOffset;
328- uint64_t LastExplicitArgOffset = ExplicitArgOffset;
32999 ExplicitArgOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + AllocSize;
330100
331- // Guard against the situation where hidden arguments have already been
332- // lowered and added to the kernel function signature, i.e. in a situation
333- // where this pass has run twice.
334- if (Arg.hasAttribute (" amdgpu-hidden-argument" ))
335- break ;
336-
337- // Try to preload this argument into user SGPRs.
338- if (Arg.hasInRegAttr () && InPreloadSequence && ST.hasKernargPreload () &&
339- !Arg.getType ()->isAggregateType ())
340- if (PreloadInfo.tryAllocPreloadSGPRs (AllocSize, EltOffset,
341- LastExplicitArgOffset))
342- continue ;
343-
344- InPreloadSequence = false ;
345-
346- if (Arg.use_empty ())
101+ // Skip inreg arguments which should be preloaded.
102+ if (Arg.use_empty () || Arg.hasInRegAttr ())
347103 continue ;
348104
349105 // If this is byval, the loads are already explicit in the function. We just
@@ -483,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
483239 KernArgSegment->addRetAttr (
484240 Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
485241
486- if (InPreloadSequence) {
487- uint64_t ImplicitArgsBaseOffset =
488- alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
489- BaseOffset;
490- PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
491- ExplicitArgOffset, Builder);
492- }
493-
494242 return true ;
495243}
496244
0 commit comments