Commit 09f1402

[AMDGPU][NFCI] Remove preload kernarg alloc dep on DAG isel path
Makes allocatePreloadKernArgSGPRs callable from both the DAG isel and GlobalISel paths, in preparation for supporting kernarg preloading with GlobalISel. This is also a prerequisite for refactoring kernarg preloading to allow preloading non-consecutive arguments.
1 parent 5996496 commit 09f1402

2 files changed: +78 −56 lines changed
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 78 additions & 55 deletions
@@ -2465,71 +2465,94 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+                                uint64_t ExplicitArgOffset, uint64_t ArgOffset,
+                                unsigned ArgSize, unsigned Idx,
+                                MachineFunction &MF, const SIRegisterInfo &TRI,
+                                SIMachineFunctionInfo &Info, CCState &CCInfo) {
+  if (ArgOffset >= ExplicitArgOffset)
+    return false;
+
+  const Align KernelArgBaseAlign = Align(16);
+  Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+  unsigned NumAllocSGPRs = alignTo(ArgSize, 4) / 4;
+
+  // Arg is preloaded into the previous SGPR.
+  if (ArgSize < 4 && Alignment < 4) {
+    Info.getArgInfo().PreloadKernArgs[Idx].Regs.push_back(
+        Info.getArgInfo().PreloadKernArgs[Idx - 1].Regs[0]);
+    return true;
+  }
+
+  unsigned Padding = ArgOffset - LastExplicitArgOffset;
+  unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+  // Check for free user SGPRs for preloading.
+  if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+      Info.getUserSGPRInfo().getNumFreeUserSGPRs()) {
+    return false;
+  }
+
+  // Preload this argument.
+  const TargetRegisterClass *RC =
+      TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+  SmallVectorImpl<MCRegister> *PreloadRegs =
+      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+  if (PreloadRegs->size() > 1)
+    RC = &AMDGPU::SGPR_32RegClass;
+
+  for (auto &Reg : *PreloadRegs) {
+    assert(Reg);
+    MF.addLiveIn(Reg, RC);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+  return true;
+}
+
 // Allocate preloaded kernel arguments. Arguments to be preloaded must be
 // sequential starting from the first argument.
 void SITargetLowering::allocatePreloadKernArgSGPRs(
-    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
-    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
   Function &F = MF.getFunction();
-  unsigned LastExplicitArgOffset =
-      MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
-  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
-  bool InPreloadSequence = true;
-  unsigned InIdx = 0;
+  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+  uint64_t ExplicitArgOffset = BaseOffset;
+  uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+  unsigned LocIdx = 0;
   for (auto &Arg : F.args()) {
-    if (!InPreloadSequence || !Arg.hasInRegAttr())
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    const bool IsByRef = Arg.hasByRefAttr();
+    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+    if (AllocSize == 0)
       break;
 
-    int ArgIdx = Arg.getArgNo();
-    // Don't preload non-original args or parts not in the current preload
-    // sequence.
-    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
-                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+    if (!Arg.hasInRegAttr())
       break;
 
-    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
-           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
-         InIdx++) {
-      assert(ArgLocs[ArgIdx].isMemLoc());
-      auto &ArgLoc = ArgLocs[InIdx];
-      const Align KernelArgBaseAlign = Align(16);
-      unsigned ArgOffset = ArgLoc.getLocMemOffset();
-      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
-      unsigned NumAllocSGPRs =
-          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
-      // Arg is preloaded into the previous SGPR.
-      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
-        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
-            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
-        continue;
-      }
-
-      unsigned Padding = ArgOffset - LastExplicitArgOffset;
-      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
-      // Check for free user SGPRs for preloading.
-      if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
-          SGPRInfo.getNumFreeUserSGPRs()) {
-        InPreloadSequence = false;
-        break;
-      }
-
-      // Preload this argument.
-      const TargetRegisterClass *RC =
-          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
-      SmallVectorImpl<MCRegister> *PreloadRegs =
-          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
-      if (PreloadRegs->size() > 1)
-        RC = &AMDGPU::SGPR_32RegClass;
-      for (auto &Reg : *PreloadRegs) {
-        assert(Reg);
-        MF.addLiveIn(Reg, RC);
-        CCInfo.AllocateReg(Reg);
+    if (!ArgLocs.size()) {
+      // global isel
+      allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
+                          ArgOffset, AllocSize, Arg.getArgNo(), MF,
+                          TRI, Info, CCInfo);
+    } else {
+      // DAG isel
+      for (; LocIdx < ArgLocs.size(); LocIdx++) {
+        CCValAssign &ArgLoc = ArgLocs[LocIdx];
+        assert(ArgLoc.isMemLoc());
+        uint64_t LocOffset = ArgLoc.getLocMemOffset();
+        unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+        if (!allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
+                                 LocOffset, LocSize, LocIdx, MF, TRI, Info,
+                                 CCInfo))
+          break;
       }
-
-      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
     }
   }
 }
@@ -2854,7 +2877,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
       allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
       if (IsKernel && Subtarget->hasKernargPreload())
-        allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+        allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
 
       allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
     } else if (!IsGraphics) {
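
To make the helper's size and padding arithmetic concrete, here is a small self-contained sketch (not part of the commit) of the same math allocPreloadKernArg performs: rounding argument sizes up to 4-byte SGPR lanes, the commonAlignment(16, offset) test that lets a sub-4-byte argument pack into the previous SGPR, and the padding-SGPR count for gaps between arguments. alignToVal and commonAlign are local stand-ins for llvm::alignTo and llvm::commonAlignment, the kernarg layout in main is hypothetical, and the user-SGPR budget check is omitted.

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::alignTo: round Value up to a multiple of Align.
static uint64_t alignToVal(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Stand-in for llvm::commonAlignment: largest power of two dividing
// both A and Offset.
static uint64_t commonAlign(uint64_t A, uint64_t Offset) {
  if (Offset == 0)
    return A;
  uint64_t LowBit = Offset & (~Offset + 1); // lowest set bit of Offset
  return LowBit < A ? LowBit : A;
}

int main() {
  // Hypothetical kernarg layout: (offset, size) pairs in bytes, as the
  // DAG path would see them in CCValAssign mem locations.
  struct { uint64_t Offset; unsigned Size; } Args[] = {
      {0, 4}, {4, 2}, {6, 2}, {8, 8}};

  uint64_t LastExplicitArgOffset = 0; // assume BaseOffset = 0
  for (auto &A : Args) {
    uint64_t Alignment = commonAlign(16, A.Offset); // KernelArgBaseAlign = 16
    unsigned NumAllocSGPRs = (unsigned)alignToVal(A.Size, 4) / 4;
    if (A.Size < 4 && Alignment < 4) {
      // Mirrors the ArgSize < 4 && Alignment < 4 early-out: the argument
      // shares the SGPR already allocated for its predecessor.
      printf("offset %2llu size %u -> reuse previous SGPR\n",
             (unsigned long long)A.Offset, A.Size);
      continue;
    }
    unsigned PaddingSGPRs =
        (unsigned)alignToVal(A.Offset - LastExplicitArgOffset, 4) / 4;
    printf("offset %2llu size %u -> %u SGPR(s), %u padding SGPR(s)\n",
           (unsigned long long)A.Offset, A.Size, NumAllocSGPRs, PaddingSGPRs);
    LastExplicitArgOffset = NumAllocSGPRs * 4 + A.Offset;
  }
  return 0;
}

Running this shows the 2-byte argument at offset 6 reusing the SGPR allocated for the argument at offset 4, matching the small-argument packing branch in the diff.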

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 0 additions & 1 deletion
@@ -563,7 +563,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   void allocatePreloadKernArgSGPRs(CCState &CCInfo,
                                    SmallVectorImpl<CCValAssign> &ArgLocs,
-                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                    MachineFunction &MF,
                                    const SIRegisterInfo &TRI,
                                    SIMachineFunctionInfo &Info) const;
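
As context for the signature change above, a condensed, hypothetical sketch of the convention the refactor establishes: an empty ArgLocs selects the GlobalISel-style walk that derives offsets from IR argument types, while a populated ArgLocs selects the DAG path that reuses precomputed CCValAssign locations. The structs and allocatePreload function below are toy stand-ins, not LLVM API, and the GlobalISel caller itself is not part of this commit.

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-ins: ArgLoc mirrors the (offset, size) of a CCValAssign mem
// location; ArgTy mirrors the alloc size / ABI alignment a GlobalISel
// caller would read from the DataLayout.
struct ArgLoc { uint64_t Offset; unsigned Size; };
struct ArgTy  { unsigned AllocSize; unsigned ABIAlign; };

// Mirrors the shape of allocatePreloadKernArgSGPRs after the commit:
// empty Locs -> derive offsets from IR types (GlobalISel path);
// non-empty -> use the precomputed locations (DAG path).
static void allocatePreload(const std::vector<ArgLoc> &Locs,
                            const std::vector<ArgTy> &IRArgs,
                            uint64_t BaseOffset) {
  uint64_t ExplicitArgOffset = BaseOffset;
  if (Locs.empty()) {
    printf("GlobalISel-style walk:\n");
    for (const ArgTy &T : IRArgs) {
      // alignTo(ExplicitArgOffset, ABIAlign), as in the diff.
      uint64_t Aligned =
          (ExplicitArgOffset + T.ABIAlign - 1) / T.ABIAlign * T.ABIAlign;
      printf("  arg at offset %llu, size %u\n",
             (unsigned long long)(Aligned + BaseOffset), T.AllocSize);
      ExplicitArgOffset = Aligned + T.AllocSize;
    }
  } else {
    printf("DAG-style walk:\n");
    for (const ArgLoc &L : Locs)
      printf("  arg at offset %llu, size %u\n",
             (unsigned long long)L.Offset, L.Size);
  }
}

int main() {
  std::vector<ArgTy> IRArgs = {{4, 4}, {8, 8}};
  allocatePreload({}, IRArgs, 0);               // GlobalISel path
  allocatePreload({{0, 4}, {8, 8}}, IRArgs, 0); // DAG path
  return 0;
}

Both walks print the same offsets for this layout, which is the property the shared allocPreloadKernArg helper relies on.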
