@@ -2841,6 +2841,86 @@ void SITargetLowering::insertCopiesSplitCSR(
28412841 }
28422842}
28432843
2844+ class InregVPGRSpiller {
2845+ CCState &State;
2846+ const unsigned WaveFrontSize;
2847+
2848+ Register CurReg;
2849+ unsigned CurLane = 0;
2850+
2851+ protected:
2852+ SelectionDAG &DAG;
2853+ MachineFunction &MF;
2854+
2855+ Register getCurReg() const { return CurReg; }
2856+ unsigned getCurLane() const { return CurLane; }
2857+
2858+ InregVPGRSpiller(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
2859+ : State(State),
2860+ WaveFrontSize(MF.getSubtarget<GCNSubtarget>().getWavefrontSize()),
2861+ DAG(DAG), MF(MF) {}
2862+
2863+ void setReg(Register &Reg) {
2864+ if (CurReg.isValid()) {
2865+ State.DeallocateReg(Reg);
2866+ Reg = CurReg;
2867+ } else {
2868+ CurReg = Reg;
2869+ }
2870+ }
2871+
2872+ void forward() {
2873+ // We have used the same VGPRs of all the lanes, so we need to reset it and
2874+ // pick up a new one in the next move.
2875+ if (++CurLane % WaveFrontSize == 0)
2876+ CurReg = 0;
2877+ }
2878+ };
2879+
2880+ class InregVPGRSpillerCallee final : private InregVPGRSpiller {
2881+ public:
2882+ InregVPGRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
2883+ : InregVPGRSpiller(DAG, MF, State) {}
2884+
2885+ SDValue read(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
2886+ setReg(Reg);
2887+
2888+ MF.addLiveIn(getCurReg(), &AMDGPU::VGPR_32RegClass);
2889+
2890+ // TODO: Do we need the chain here?
2891+ SmallVector<SDValue, 4> Operands{
2892+ DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
2893+ DAG.getRegister(getCurReg(), VT),
2894+ DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
2895+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
2896+
2897+ forward();
2898+
2899+ return Res;
2900+ }
2901+ };
2902+
2903+ class InregVPGRSpillerCallSite final : private InregVPGRSpiller {
2904+ public:
2905+ InregVPGRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF,
2906+ CCState &State)
2907+ : InregVPGRSpiller(DAG, MF, State) {}
2908+
2909+ SDValue write(const SDLoc &SL, Register &Reg, SDValue V, EVT VT) {
2910+ setReg(Reg);
2911+
2912+ SmallVector<SDValue, 4> Operands{
2913+ DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
2914+ DAG.getRegister(getCurReg(), VT), V,
2915+ DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
2916+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
2917+
2918+ forward();
2919+
2920+ return Res;
2921+ }
2922+ };
2923+
28442924SDValue SITargetLowering::LowerFormalArguments(
28452925 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
28462926 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3043,7 @@ SDValue SITargetLowering::LowerFormalArguments(
29633043 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
29643044 // kern arg offset.
29653045 const Align KernelArgBaseAlign = Align(16);
3046+ InregVPGRSpillerCallee Spiller(DAG, MF, CCInfo);
29663047
29673048 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
29683049 const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3211,17 @@ SDValue SITargetLowering::LowerFormalArguments(
31303211 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
31313212 EVT ValVT = VA.getValVT();
31323213
3133- Reg = MF.addLiveIn(Reg, RC);
3134- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3214+ SDValue Val;
3215+ // If an argument is marked inreg but gets pushed to a VGPR, it indicates
3216+ // we've run out of SGPRs for argument passing. In such cases, we'd prefer
3217+ // to start packing inreg arguments into individual lanes of VGPRs, rather
3218+ // than placing them directly into VGPRs.
3219+ if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
3220+ Val = Spiller.read(Chain, DL, Reg, VT);
3221+ } else {
3222+ Reg = MF.addLiveIn(Reg, RC);
3223+ Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3224+ }
31353225
31363226 if (Arg.Flags.isSRet()) {
31373227 // The return object should be reasonably addressable.
@@ -3875,6 +3965,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38753965
38763966 MVT PtrVT = MVT::i32;
38773967
3968+ InregVPGRSpillerCallSite Spiller(DAG, MF, CCInfo);
3969+
38783970 // Walk the register/memloc assignments, inserting copies/loads.
38793971 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
38803972 CCValAssign &VA = ArgLocs[i];
@@ -3904,6 +3996,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
39043996 }
39053997
39063998 if (VA.isRegLoc()) {
3999+ Register Reg = VA.getLocReg();
4000+ if (Outs[i].Flags.isInReg() && AMDGPU::VGPR_32RegClass.contains(Reg))
4001+ Arg = Spiller.write(DL, Reg, Arg, VA.getLocVT());
39074002 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
39084003 } else {
39094004 assert(VA.isMemLoc());
0 commit comments