
Commit 7190fa6

[WIP][AMDGPU] Improve the handling of inreg arguments
When SGPRs available for `inreg` argument passing run out, the compiler silently falls back to using whole VGPRs to pass those arguments. Ideally, instead of using whole VGPRs, we should pack `inreg` arguments into individual lanes of VGPRs. This PR introduces `InregVGPRSpiller`, which handles this packing. It uses `v_writelane` at the call site to place `inreg` arguments into specific VGPR lanes, and then extracts them in the callee using `v_readlane`. Fixes #130443 and #129071.
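At a glance, for two overflowed `inreg` arguments the intended pattern looks roughly like this (an illustrative sketch, not compiler output; `s<a>` and `s<b>` stand for whichever SGPRs hold the values at the call site, and the lane indices assume these are the first two spilled arguments):

  ; call site: pack the SGPR values into lanes 0 and 1 of a single VGPR
  v_writelane_b32 v0, s<a>, 0
  v_writelane_b32 v0, s<b>, 1
  ; callee: read them back out of the same lanes
  v_readlane_b32 s0, v0, 0
  v_readlane_b32 s1, v0, 1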
1 parent 6b98134 commit 7190fa6

File tree: 2 files changed, +125 −2


llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 97 additions & 2 deletions
@@ -2841,6 +2841,86 @@ void SITargetLowering::insertCopiesSplitCSR(
   }
 }
 
+class InregVGPRSpiller {
+  CCState &State;
+  const unsigned WaveFrontSize;
+
+  Register CurReg;
+  unsigned CurLane = 0;
+
+protected:
+  SelectionDAG &DAG;
+  MachineFunction &MF;
+
+  Register getCurReg() const { return CurReg; }
+  unsigned getCurLane() const { return CurLane; }
+
+  InregVGPRSpiller(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+      : State(State),
+        WaveFrontSize(MF.getSubtarget<GCNSubtarget>().getWavefrontSize()),
+        DAG(DAG), MF(MF) {}
+
+  void setReg(Register &Reg) {
+    if (CurReg.isValid()) {
+      State.DeallocateReg(Reg);
+      Reg = CurReg;
+    } else {
+      CurReg = Reg;
+    }
+  }
+
+  void forward() {
+    // We have used every lane of the current VGPR, so reset it and pick up a
+    // fresh one on the next call.
+    if (++CurLane % WaveFrontSize == 0)
+      CurReg = 0;
+  }
+};
+
+class InregVGPRSpillerCallee final : private InregVGPRSpiller {
+public:
+  InregVGPRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+      : InregVGPRSpiller(DAG, MF, State) {}
+
+  SDValue read(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
+    setReg(Reg);
+
+    MF.addLiveIn(getCurReg(), &AMDGPU::VGPR_32RegClass);
+
+    // TODO: Do we need the chain here?
+    SmallVector<SDValue, 4> Operands{
+        DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+        DAG.getRegister(getCurReg(), VT),
+        DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
+    SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+
+    forward();
+
+    return Res;
+  }
+};
+
+class InregVGPRSpillerCallSite final : private InregVGPRSpiller {
+public:
+  InregVGPRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF,
+                           CCState &State)
+      : InregVGPRSpiller(DAG, MF, State) {}
+
+  SDValue write(const SDLoc &SL, Register &Reg, SDValue V, EVT VT) {
+    setReg(Reg);
+
+    SmallVector<SDValue, 4> Operands{
+        DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
+        DAG.getRegister(getCurReg(), VT), V,
+        DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
+    SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+
+    forward();
+
+    return Res;
+  }
+};
+
 SDValue SITargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3043,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
   // kern arg offset.
   const Align KernelArgBaseAlign = Align(16);
+  InregVGPRSpillerCallee Spiller(DAG, MF, CCInfo);
 
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3211,17 @@ SDValue SITargetLowering::LowerFormalArguments(
       llvm_unreachable("Unexpected register class in LowerFormalArguments!");
     EVT ValVT = VA.getValVT();
 
-    Reg = MF.addLiveIn(Reg, RC);
-    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+    SDValue Val;
+    // If an argument is marked inreg but gets pushed to a VGPR, it indicates
+    // we've run out of SGPRs for argument passing. In such cases, we'd prefer
+    // to pack inreg arguments into individual lanes of VGPRs, rather than
+    // assigning each of them a whole VGPR.
+    if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
+      Val = Spiller.read(Chain, DL, Reg, VT);
+    } else {
+      Reg = MF.addLiveIn(Reg, RC);
+      Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+    }
 
     if (Arg.Flags.isSRet()) {
       // The return object should be reasonably addressable.
@@ -3875,6 +3965,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   MVT PtrVT = MVT::i32;
 
+  InregVGPRSpillerCallSite Spiller(DAG, MF, CCInfo);
+
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -3904,6 +3996,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
 
     if (VA.isRegLoc()) {
+      Register Reg = VA.getLocReg();
+      if (Outs[i].Flags.isInReg() && AMDGPU::VGPR_32RegClass.contains(Reg))
+        Arg = Spiller.write(DL, Reg, Arg, VA.getLocVT());
       RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
     } else {
       assert(VA.isMemLoc());
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s
+
+; arg3 and arg4 overflow the SGPR argument registers; instead of taking v0 and v1, they should be packed into lanes 0 and 1 of v0 and extracted with v_readlane.
+define i32 @callee(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <2 x i32> inreg %arg2, i32 inreg %arg3, i32 inreg %arg4) {
+; CHECK-LABEL: callee:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 0
+; CHECK-NEXT:    s_add_i32 s1, s1, s0
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i32 %arg3, %arg4
+  ret i32 %add
+}
+
+define amdgpu_kernel void @kernel(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p) {
+  %arg0 = load <8 x i32>, ptr %p0
+  %arg1 = load <8 x i32>, ptr %p1
+  %arg2 = load <2 x i32>, ptr %p2
+  %arg3 = load i32, ptr %p3
+  %arg4 = load i32, ptr %p4
+  %ret = call i32 @callee(<8 x i32> %arg0, <8 x i32> %arg1, <2 x i32> %arg2, i32 %arg3, i32 %arg4)
+  store i32 %ret, ptr %p
+  ret void
+}
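The test above only exercises the callee side. On the call-site side, @kernel is expected to emit the matching packing sequence before the call, roughly as follows (a hedged sketch rather than autogenerated checks; the source SGPRs and the register pair holding the callee address depend on register allocation):

  v_writelane_b32 v0, s<x>, 0          ; pack %arg3 into lane 0
  v_writelane_b32 v0, s<y>, 1          ; pack %arg4 into lane 1
  s_swappc_b64 s[30:31], s[<n>:<n+1>]  ; call callee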
