@@ -3565,7 +3565,6 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
35653565 if (IsVarArg)
35663566 return false;
35673567
3568- // FIXME: We need to know all arguments passed in SGPR are uniform.
35693568 for (const Argument &Arg : CallerF.args()) {
35703569 if (Arg.hasByValAttr())
35713570 return false;
@@ -3593,6 +3592,8 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
35933592 SmallVector<CCValAssign, 16> ArgLocs;
35943593 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
35953594
3595+ // FIXME: We are not allocating special input registers, so we will be
3596+ // deciding based on incorrect register assignments.
35963597 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
35973598
35983599 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -3602,6 +3603,21 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
36023603 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
36033604 return false;
36043605
3606+ for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3607+ // FIXME: What about inreg arguments that end up passed in memory?
3608+ if (!CCVA.isRegLoc())
3609+ continue;
3610+
3611+ // If we are passing an argument in an SGPR, and the value is divergent,
3612+ // this call requires a waterfall loop.
3613+ if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3614+ LLVM_DEBUG(
3615+ dbgs() << "Cannot tail call due to divergent outgoing argument in "
3616+ << printReg(CCVA.getLocReg(), TRI) << '\n');
3617+ return false;
3618+ }
3619+ }
3620+
36053621 const MachineRegisterInfo &MRI = MF.getRegInfo();
36063622 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
36073623}
@@ -3734,6 +3750,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
37343750 // arguments to begin at SP+0. Completely unused for non-tail calls.
37353751 int32_t FPDiff = 0;
37363752 MachineFrameInfo &MFI = MF.getFrameInfo();
3753+ auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
37373754
37383755 // Adjust the stack pointer for the new arguments...
37393756 // These operations are automatically eliminated by the prolog/epilog pass
@@ -3756,6 +3773,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
37563773 }
37573774 }
37583775
3776+ const unsigned NumSpecialInputs = RegsToPass.size();
3777+
37593778 MVT PtrVT = MVT::i32;
37603779
37613780 // Walk the register/memloc assignments, inserting copies/loads.
@@ -3857,16 +3876,40 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38573876 if (!MemOpChains.empty())
38583877 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
38593878
3879+ SDValue ReadFirstLaneID =
3880+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3881+
3882+ SDValue TokenGlue;
3883+ if (CLI.ConvergenceControlToken) {
3884+ TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3885+ CLI.ConvergenceControlToken);
3886+ }
3887+
38603888 // Build a sequence of copy-to-reg nodes chained together with token chain
38613889 // and flag operands which copy the outgoing args into the appropriate regs.
38623890 SDValue InGlue;
3863- for (auto &RegToPass : RegsToPass) {
3864- Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3865- RegToPass.second, InGlue);
3891+
3892+ unsigned ArgIdx = 0;
3893+ for (auto [Reg, Val] : RegsToPass) {
3894+ if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
3895+ TRI->isSGPRPhysReg(Reg)) {
3896+ // Speculatively insert a readfirstlane in case this is a uniform value in
3897+ // a VGPR.
3898+ //
3899+ // FIXME: We need to execute this in a waterfall loop if it is a divergent
3900+ // value, so let that continue to produce invalid code.
3901+
3902+ SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3903+ if (TokenGlue)
3904+ ReadfirstlaneArgs.push_back(TokenGlue);
3905+ Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3906+ ReadfirstlaneArgs);
3907+ }
3908+
3909+ Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
38663910 InGlue = Chain.getValue(1);
38673911 }
38683912
3869-
38703913 // We don't usually want to end the call-sequence here because we would tidy
38713914 // the frame up *after* the call, however in the ABI-changing tail-call case
38723915 // we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3896,12 +3939,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38963939 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
38973940
38983941 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3899- if (CLI.ConvergenceControlToken) {
3900- SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {},
3901- MVT::Glue, CLI.ConvergenceControlToken);
3942+ if (TokenGlue)
39023943 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3903- }
3904-
39053944 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
39063945 ReadfirstlaneArgs);
39073946 }
@@ -3928,7 +3967,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
39283967 }
39293968
39303969 // Add a register mask operand representing the call-preserved registers.
3931- auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
39323970 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
39333971 assert(Mask && "Missing call preserved mask for calling convention");
39343972 Ops.push_back(DAG.getRegisterMask(Mask));
0 commit comments