1313
1414#include " AMDGPU.h"
1515#include " GCNSubtarget.h"
16+ #include " llvm/ADT/StringExtras.h"
17+ #include " llvm/Analysis/ValueTracking.h"
1618#include " llvm/CodeGen/TargetPassConfig.h"
1719#include " llvm/IR/IRBuilder.h"
1820#include " llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,110 @@ class PreloadKernelArgInfo {
3133 const GCNSubtarget &ST;
3234 unsigned NumFreeUserSGPRs;
3335
34- public:
35- SmallVector<llvm::Metadata *, 8 > KernelArgMetadata;
// Hidden (implicit) kernel arguments known to this pass. The enumerators are
// used as indices into the HiddenArgs table, so their order must match it.
// END_HIDDEN_ARGS is the number of entries and is also returned by
// getHiddenArgFromOffset when no entry matches.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y,
  HIDDEN_BLOCK_COUNT_Z,

  HIDDEN_GROUP_SIZE_X,
  HIDDEN_GROUP_SIZE_Y,
  HIDDEN_GROUP_SIZE_Z,

  HIDDEN_REMAINDER_X,
  HIDDEN_REMAINDER_Y,
  HIDDEN_REMAINDER_Z,

  END_HIDDEN_ARGS
};
48+
// Static description of a single hidden kernel argument.
struct HiddenArgInfo {
  // Byte offset of the argument, measured from the kernarg segment location
  // that the implicitarg pointer points to.
  uint8_t Offset;

  // Width of the hidden argument in bytes.
  uint8_t Size;

  // Name the hidden argument gets in the kernel signature.
  const char *Name;
};
59+
60+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61+ {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
62+ {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
63+ {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
64+ {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
65+ {22 , 2 , " _hidden_remainder_z" }};
66+
67+ static HiddenArg getHiddenArgFromOffset (unsigned Offset) {
68+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
69+ if (HiddenArgs[I].Offset == Offset)
70+ return static_cast <HiddenArg>(I);
71+
72+ return END_HIDDEN_ARGS;
73+ }
74+
75+ static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
76+ if (HA < END_HIDDEN_ARGS)
77+ return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
78+
79+ llvm_unreachable (" Unexpected hidden argument." );
80+ }
81+
82+ static const char *getHiddenArgName (HiddenArg HA) {
83+ if (HA < END_HIDDEN_ARGS) {
84+ return HiddenArgs[HA].Name ;
85+ }
86+ llvm_unreachable (" Unexpected hidden argument." );
87+ }
3688
89+ // Clones the function after adding implicit arguments to the argument list
90+ // and returns the new updated function. Preloaded implicit arguments are
91+ // added up to and including the last one that will be preloaded, indicated by
92+ // LastPreloadIndex. Currently preloading is only performed on the totality of
93+ // sequential data from the kernarg segment including implicit (hidden)
94+ // arguments. This means that all arguments up to the last preloaded argument
95+ // will also be preloaded even if that data is unused.
96+ Function *cloneFunctionWithPreloadImplicitArgs (unsigned LastPreloadIndex) {
97+ FunctionType *FT = F.getFunctionType ();
98+ LLVMContext &Ctx = F.getParent ()->getContext ();
99+ SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
100+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
101+ FTypes.push_back (getHiddenArgType (Ctx, HiddenArg (I)));
102+
103+ FunctionType *NFT =
104+ FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
105+ Function *NF =
106+ Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
107+
108+ NF->copyAttributesFrom (&F);
109+ NF->copyMetadata (&F, 0 );
110+ NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
111+
112+ F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
113+ NF->takeName (&F);
114+ NF->splice (NF->begin (), &F);
115+
116+ Function::arg_iterator NFArg = NF->arg_begin ();
117+ for (Argument &Arg : F.args ()) {
118+ Arg.replaceAllUsesWith (&*NFArg);
119+ NFArg->takeName (&Arg);
120+ ++NFArg;
121+ }
122+
123+ AttrBuilder AB (Ctx);
124+ AB.addAttribute (Attribute::InReg);
125+ AB.addAttribute (" amdgpu-hidden-argument" );
126+ AttributeList AL = NF->getAttributes ();
127+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I) {
128+ AL = AL.addParamAttributes (Ctx, NFArg->getArgNo (), AB);
129+ NFArg++->setName (getHiddenArgName (HiddenArg (I)));
130+ }
131+
132+ NF->setAttributes (AL);
133+ F.replaceAllUsesWith (NF);
134+ F.setCallingConv (CallingConv::C);
135+
136+ return NF;
137+ }
138+
139+ public:
37140 PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38141 setInitialFreeUserSGPRsCount ();
39142 }
@@ -86,6 +189,87 @@ class PreloadKernelArgInfo {
86189 << " \n " ;
87190 return true ;
88191 }
192+
193+ // Try to allocate SGPRs to preload implicit kernel arguments.
194+ void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
195+ IRBuilder<> &Builder) {
196+ StringRef Name = Intrinsic::getName (Intrinsic::amdgcn_implicitarg_ptr);
197+ Function *ImplicitArgPtr = F.getParent ()->getFunction (Name);
198+ if (!ImplicitArgPtr)
199+ return ;
200+
201+ const DataLayout &DL = F.getParent ()->getDataLayout ();
202+ // Pair is the load and the load offset.
203+ SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
204+ for (auto *U : ImplicitArgPtr->users ()) {
205+ Instruction *CI = dyn_cast<Instruction>(U);
206+ if (!CI || CI->getParent ()->getParent () != &F)
207+ continue ;
208+
209+ for (auto *U : CI->users ()) {
210+ int64_t Offset = 0 ;
211+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
212+ if (!Load) {
213+ if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
214+ continue ;
215+
216+ Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
217+ }
218+
219+ if (!Load || !Load->isSimple ())
220+ continue ;
221+
222+ // FIXME: Expand to handle 64-bit implicit args and large merged loads.
223+ LLVMContext &Ctx = F.getParent ()->getContext ();
224+ Type *LoadTy = Load->getType ();
225+ HiddenArg HA = getHiddenArgFromOffset (Offset);
226+ if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType (Ctx, HA))
227+ continue ;
228+
229+ ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
230+ }
231+ }
232+
233+ if (ImplicitArgLoads.empty ())
234+ return ;
235+
236+ // Allocate loads in order of offset. We need to be sure that the implicit
237+ // argument can actually be preloaded.
238+ std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (), less_second ());
239+
240+ uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
241+ // If we fail to preload any implicit argument we know we don't have SGPRs
242+ // to preload any subsequent ones with larger offsets. Find the first
243+ // argument that we cannot preload.
244+ auto *PreloadEnd = std::find_if (
245+ ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
246+ [&](const std::pair<LoadInst *, unsigned > &Load) {
247+ unsigned LoadSize = DL.getTypeStoreSize (Load.first ->getType ());
248+ unsigned LoadOffset = Load.second ;
249+ if (!tryAllocPreloadSGPRs (LoadSize,
250+ LoadOffset + ImplicitArgsBaseOffset,
251+ LastExplicitArgOffset))
252+ return true ;
253+
254+ LastExplicitArgOffset = LoadOffset + LoadSize;
255+ return false ;
256+ });
257+
258+ if (PreloadEnd == ImplicitArgLoads.begin ())
259+ return ;
260+
261+ unsigned LastHiddenArgIndex = getHiddenArgFromOffset (PreloadEnd[-1 ].second );
262+ Function *NF = cloneFunctionWithPreloadImplicitArgs (LastHiddenArgIndex);
263+ assert (NF);
264+ for (const auto *I = ImplicitArgLoads.begin (); I != PreloadEnd; ++I) {
265+ LoadInst *LoadInst = I->first ;
266+ unsigned LoadOffset = I->second ;
267+ unsigned HiddenArgIndex = getHiddenArgFromOffset (LoadOffset);
268+ unsigned Index = NF->arg_size () - LastHiddenArgIndex + HiddenArgIndex - 1 ;
269+ Argument *Arg = NF->getArg (Index);
270+ LoadInst->replaceAllUsesWith (Arg);
271+ }
272+ }
89273};
90274
91275class AMDGPULowerKernelArguments : public FunctionPass {
@@ -169,6 +353,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
169353 uint64_t LastExplicitArgOffset = ExplicitArgOffset;
170354 ExplicitArgOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + AllocSize;
171355
356+ // Guard against the situation where hidden arguments have already been
357+ // lowered and added to the kernel function signature, i.e. in a situation
358+ // where this pass has run twice.
359+ if (Arg.hasAttribute (" amdgpu-hidden-argument" ))
360+ break ;
361+
172362 if (DBG) {
173363 llvm::errs () << " arg: " << Arg
174364 << " Arg.hasInRegAttr()=" << Arg.hasInRegAttr ()
@@ -315,6 +505,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
315505 KernArgSegment->addRetAttr (
316506 Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
317507
508+ if (InPreloadSequence) {
509+ uint64_t ImplicitArgsBaseOffset =
510+ alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
511+ BaseOffset;
512+ PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
513+ Builder);
514+ }
515+
318516 return true ;
319517}
320518
0 commit comments