llvm
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp‎
Lines changed: 3 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp‎
Lines changed: 49 additions & 10 deletions b/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp‎
Lines changed: 49 additions & 10 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIRegisterInfo.h‎
Lines changed: 3 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIRegisterInfo.h‎
Lines changed: 3 additions & 0 deletions
@@ -1142,6 +1142,9 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
     return false;
   }
 
+  // FIXME: We need to check if any arguments passed in SGPR are uniform. If
+  // they are not, this cannot be a tail call. If they are uniform, but may be
+  // VGPR, we need to insert readfirstlanes.
   if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
     return false;
 
 
@@ -3593,6 +3593,8 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
 
+  // FIXME: We are not allocating special input registers, so we will be
+  // deciding based on incorrect register assignments.
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
 
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -3602,6 +3604,21 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
   if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
     return false;
 
+  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
+    // FIXME: What about inreg arguments that end up passed in memory?
+    if (!CCVA.isRegLoc())
+      continue;
+
+    // If we are passing an argument in an SGPR, and the value is divergent,
+    // this call requires a waterfall loop.
+    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
+      LLVM_DEBUG(
+          dbgs() << "Cannot tail call due to divergent outgoing argument in "
+                 << printReg(CCVA.getLocReg(), TRI) << '\n');
+      return false;
+    }
+  }
+
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
 }
@@ -3734,6 +3751,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // arguments to begin at SP+0. Completely unused for non-tail calls.
   int32_t FPDiff = 0;
   MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
@@ -3756,6 +3774,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
   }
 
+  const unsigned NumSpecialInputs = RegsToPass.size();
+
   MVT PtrVT = MVT::i32;
 
   // Walk the register/memloc assignments, inserting copies/loads.
@@ -3857,16 +3877,40 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
+  SDValue ReadFirstLaneID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
+
+  SDValue TokenGlue;
+  if (CLI.ConvergenceControlToken) {
+    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
+                            CLI.ConvergenceControlToken);
+  }
+
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
   SDValue InGlue;
-  for (auto &RegToPass : RegsToPass) {
-    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
-                             RegToPass.second, InGlue);
+
+  unsigned ArgIdx = 0;
+  for (auto [Reg, Val] : RegsToPass) {
+    if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
+        TRI->isSGPRPhysReg(Reg)) {
+      // Speculatively insert a readfirstlane in case this is a uniform value in
+      // a VGPR.
+      //
+      // FIXME: We need to execute this in a waterfall loop if it is a divergent
+      // value, so let that continue to produce invalid code.
+
+      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
+      if (TokenGlue)
+        ReadfirstlaneArgs.push_back(TokenGlue);
+      Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
+                        ReadfirstlaneArgs);
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
     InGlue = Chain.getValue(1);
   }
 
-
   // We don't usually want to end the call-sequence here because we would tidy
   // the frame up *after* the call, however in the ABI-changing tail-call case
   // we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3896,12 +3940,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
           DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
 
       SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
-      if (CLI.ConvergenceControlToken) {
-        SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {},
-                                        MVT::Glue, CLI.ConvergenceControlToken);
+      if (TokenGlue)
         ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
-      }
-
       Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
                            ReadfirstlaneArgs);
     }
@@ -3928,7 +3968,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   // Add a register mask operand representing the call-preserved registers.
-  auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
 
@@ -210,6 +210,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   }
 
   bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const;
+  bool isSGPRPhysReg(Register Reg) const {
+    return isSGPRClass(getPhysRegBaseClass(Reg));
+  }
 
   /// \returns true if this class contains only VGPR registers
   static bool isVGPRClass(const TargetRegisterClass *RC) {
Original file line number	Diff line number	Diff line change
`@@ -1142,6 +1142,9 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(`
`1142`	`1142`	`return false;`
`1143`	`1143`	`}`
`1144`	`1144`
	`1145`	`+ // FIXME: We need to check if any arguments passed in SGPR are uniform. If`
	`1146`	`+ // they are not, this cannot be a tail call. If they are uniform, but may be`
	`1147`	`+ // VGPR, we need to insert readfirstlanes.`
`1145`	`1148`	`if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))`
`1146`	`1149`	`return false;`
`1147`	`1150`
Original file line number	Diff line number	Diff line change
`@@ -210,6 +210,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {`
`210`	`210`	`}`
`211`	`211`
`212`	`212`	`bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const;`
	`213`	`+ bool isSGPRPhysReg(Register Reg) const {`
	`214`	`+ return isSGPRClass(getPhysRegBaseClass(Reg));`
	`215`	`+ }`
`213`	`216`
`214`	`217`	`/// \returns true if this class contains only VGPR registers`
`215`	`218`	`static bool isVGPRClass(const TargetRegisterClass *RC) {`