
Commit aff1e13

rovka and Ana Mihajlovic committed
[AMDGPU] Dynamic VGPR support for llvm.amdgcn.cs.chain
The llvm.amdgcn.cs.chain intrinsic has a 'flags' operand which may indicate that we want to reallocate the VGPRs before performing the call. A call with the following arguments:

```
llvm.amdgcn.cs.chain %callee, %exec, %sgpr_args, %vgpr_args,
                     /*flags*/0x1, %num_vgprs, %fallback_exec, %fallback_callee
```

is supposed to do the following:

- copy the SGPR and VGPR args into their respective registers
- try to change the VGPR allocation
- if the allocation has succeeded, set EXEC to %exec and jump to %callee, otherwise set EXEC to %fallback_exec and jump to %fallback_callee

This patch implements the dynamic VGPR behaviour by generating an S_ALLOC_VGPR followed by S_CSELECT_B32/64 instructions for the EXEC and callee. The rest of the call sequence is left undisturbed (i.e. identical to the case where the flags are 0 and we don't use dynamic VGPRs). We achieve this by introducing some new pseudos (SI_CS_CHAIN_TC_Wn_DVGPR) which are expanded in the SILateBranchLowering pass, just like the simpler SI_CS_CHAIN_TC_Wn pseudos. The main reason is so that we don't risk other passes (particularly the PostRA scheduler) introducing instructions between the S_ALLOC_VGPR and the jump. Such instructions might end up using VGPRs that have been deallocated, or the wrong EXEC mask. Once the whole backend treats S_ALLOC_VGPR and changes to EXEC as barriers for instructions that use VGPRs, we could in principle move the expansion earlier (but in the absence of a good reason for that, my personal preference is to keep it later in order to make debugging easier).

Since the expansion happens after register allocation, we're careful to select constants to immediate operands instead of letting ISel generate S_MOVs which could interfere with register allocation (i.e. make it look like we need more registers than we actually do).

For GFX12, S_ALLOC_VGPR only works in wave32 mode, so we bail out during ISel in wave64 mode. However, we can define the pseudos for wave64 too so it's easy to handle if future generations support it.

Co-authored-by: Ana Mihajlovic <[email protected]>
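To make the selection step concrete, here is a minimal standalone C++ model (not part of the patch; it assumes the allocation outcome lands in SCC, which the S_CSELECT instructions consume):

```cpp
#include <cstdint>

// Models the semantics of the expanded sequence: S_ALLOC_VGPR attempts the
// reallocation, then S_CSELECT_B32/64 pick the EXEC value and jump target.
struct ChainTarget {
  uint64_t Exec;   // value to install in EXEC before the jump
  uint64_t Callee; // address to jump to
};

// 'AllocSucceeded' stands in for the status bit produced by S_ALLOC_VGPR.
ChainTarget pickChainTarget(bool AllocSucceeded, uint64_t Exec,
                            uint64_t Callee, uint64_t FallbackExec,
                            uint64_t FallbackCallee) {
  if (AllocSucceeded)
    return {Exec, Callee};
  return {FallbackExec, FallbackCallee};
}
```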
1 parent: c29d820 · commit: aff1e13

11 files changed (+746, -151 lines)

llvm/include/llvm/CodeGen/SelectionDAGISel.h

Lines changed: 15 additions & 14 deletions
```diff
@@ -328,20 +328,21 @@ class SelectionDAGISel {
   };
 
   enum {
-    OPFL_None = 0,          // Node has no chain or glue input and isn't variadic.
-    OPFL_Chain = 1,         // Node has a chain input.
-    OPFL_GlueInput = 2,     // Node has a glue input.
-    OPFL_GlueOutput = 4,    // Node has a glue output.
-    OPFL_MemRefs = 8,       // Node gets accumulated MemRefs.
-    OPFL_Variadic0 = 1<<4,  // Node is variadic, root has 0 fixed inputs.
-    OPFL_Variadic1 = 2<<4,  // Node is variadic, root has 1 fixed inputs.
-    OPFL_Variadic2 = 3<<4,  // Node is variadic, root has 2 fixed inputs.
-    OPFL_Variadic3 = 4<<4,  // Node is variadic, root has 3 fixed inputs.
-    OPFL_Variadic4 = 5<<4,  // Node is variadic, root has 4 fixed inputs.
-    OPFL_Variadic5 = 6<<4,  // Node is variadic, root has 5 fixed inputs.
-    OPFL_Variadic6 = 7<<4,  // Node is variadic, root has 6 fixed inputs.
-
-    OPFL_VariadicInfo = OPFL_Variadic6
+    OPFL_None = 0,           // Node has no chain or glue input and isn't variadic.
+    OPFL_Chain = 1,          // Node has a chain input.
+    OPFL_GlueInput = 2,      // Node has a glue input.
+    OPFL_GlueOutput = 4,     // Node has a glue output.
+    OPFL_MemRefs = 8,        // Node gets accumulated MemRefs.
+    OPFL_Variadic0 = 1 << 4, // Node is variadic, root has 0 fixed inputs.
+    OPFL_Variadic1 = 2 << 4, // Node is variadic, root has 1 fixed inputs.
+    OPFL_Variadic2 = 3 << 4, // Node is variadic, root has 2 fixed inputs.
+    OPFL_Variadic3 = 4 << 4, // Node is variadic, root has 3 fixed inputs.
+    OPFL_Variadic4 = 5 << 4, // Node is variadic, root has 4 fixed inputs.
+    OPFL_Variadic5 = 6 << 4, // Node is variadic, root has 5 fixed inputs.
+    OPFL_Variadic6 = 7 << 4, // Node is variadic, root has 6 fixed inputs.
+    OPFL_Variadic7 = 8 << 4, // Node is variadic, root has 7 fixed inputs.
+
+    OPFL_VariadicInfo = 15 << 4 // Mask for extracting the OPFL_VariadicN bits.
   };
 
   /// getNumFixedFromVariadicInfo - Transform an EmitNode flags word into the
```
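OPFL_VariadicInfo changes here from an alias for the largest OPFL_VariadicN value to a proper field mask, making room for the new OPFL_Variadic7. A standalone round-trip check (a sketch mirroring the in-tree `((Flags & OPFL_VariadicInfo) >> 4) - 1` decoder referenced in the trailing context line):

```cpp
#include <cassert>

// OPFL_VariadicN encodes N fixed inputs as (N + 1) << 4; the 4-bit mask
// below extracts that field, so N can now reach 7.
enum {
  OPFL_Variadic0 = 1 << 4,
  OPFL_Variadic6 = 7 << 4,
  OPFL_Variadic7 = 8 << 4,
  OPFL_VariadicInfo = 15 << 4, // mask for the OPFL_VariadicN bits
};

static int getNumFixedFromVariadicInfo(unsigned Flags) {
  return ((Flags & OPFL_VariadicInfo) >> 4) - 1;
}

int main() {
  assert(getNumFixedFromVariadicInfo(OPFL_Variadic0) == 0);
  assert(getNumFixedFromVariadicInfo(OPFL_Variadic6) == 6);
  assert(getNumFixedFromVariadicInfo(OPFL_Variadic7) == 7); // new case
  return 0;
}
```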

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 9 additions & 4 deletions
```diff
@@ -7996,10 +7996,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   }
   case Intrinsic::amdgcn_cs_chain: {
-    assert(I.arg_size() == 5 && "Additional args not supported yet");
-    assert(cast<ConstantInt>(I.getOperand(4))->isZero() &&
-           "Non-zero flags not supported yet");
-
     // At this point we don't care if it's amdgpu_cs_chain or
     // amdgpu_cs_chain_preserve.
     CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain;
@@ -8026,6 +8022,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
     Args[2].IsInReg = true; // EXEC should be inreg
 
+    // Forward the flags and any additional arguments.
+    for (unsigned Idx = 4; Idx < I.arg_size(); ++Idx) {
+      TargetLowering::ArgListEntry Arg;
+      Arg.Node = getValue(I.getOperand(Idx));
+      Arg.Ty = I.getOperand(Idx)->getType();
+      Arg.setAttributes(&I, Idx);
+      Args.push_back(Arg);
+    }
+
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(getCurSDLoc())
         .setChain(getRoot())
```
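For reference, the operand indices the forwarding loop above assumes, written out as a standalone sketch (the enum name is illustrative; the AMDGPU files below define the real ChainCallArgIdx enums):

```cpp
#include <cstddef>

// llvm.amdgcn.cs.chain operand layout as seen by the loop above, which
// forwards everything from index 4 (the flags) onwards.
enum CsChainOperandIdx : std::size_t {
  Callee = 0,
  Exec = 1,
  SgprArgs = 2,
  VgprArgs = 3,
  Flags = 4,          // always present; forwarded first
  NumVGPRs = 5,       // the last three exist only when flags bit 0 is set
  FallbackExec = 6,
  FallbackCallee = 7,
};
```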

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 95 additions & 31 deletions
```diff
@@ -953,17 +953,22 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
 }
 
 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
-                              bool IsTailCall, bool isWave32,
-                              CallingConv::ID CC) {
+                              bool IsTailCall, bool IsWave32,
+                              CallingConv::ID CC,
+                              bool IsDynamicVGPRChainCall = false) {
   // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
   assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
          "Indirect calls can't be tail calls, "
          "because the address can be divergent");
   if (!IsTailCall)
     return AMDGPU::G_SI_CALL;
 
-  if (AMDGPU::isChainCC(CC))
-    return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
+  if (AMDGPU::isChainCC(CC)) {
+    if (IsDynamicVGPRChainCall)
+      return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR
+                      : AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR;
+    return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
+  }
 
   return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
                                          AMDGPU::SI_TCRETURN;
@@ -972,7 +977,8 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
 // Add operands to call instruction to track the callee.
 static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                   MachineIRBuilder &MIRBuilder,
-                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
+                                  AMDGPUCallLowering::CallLoweringInfo &Info,
+                                  bool IsDynamicVGPRChainCall = false) {
   if (Info.Callee.isReg()) {
     CallInst.addReg(Info.Callee.getReg());
     CallInst.addImm(0);
@@ -983,7 +989,12 @@ static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
     auto Ptr = MIRBuilder.buildGlobalValue(
         LLT::pointer(GV->getAddressSpace(), 64), GV);
     CallInst.addReg(Ptr.getReg(0));
-    CallInst.add(Info.Callee);
+
+    if (IsDynamicVGPRChainCall)
+      // DynamicVGPR chain calls are always indirect.
+      CallInst.addImm(0);
+    else
+      CallInst.add(Info.Callee);
   } else
     return false;
 
@@ -1177,6 +1188,18 @@ void AMDGPUCallLowering::handleImplicitCallArguments(
   }
 }
 
+namespace {
+// Chain calls have special arguments that we need to handle. These have the
+// same index as they do in the llvm.amdgcn.cs.chain intrinsic.
+enum ChainCallArgIdx {
+  Exec = 1,
+  Flags = 4,
+  NumVGPRs = 5,
+  FallbackExec = 6,
+  FallbackCallee = 7,
+};
+} // anonymous namespace
+
 bool AMDGPUCallLowering::lowerTailCall(
     MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
     SmallVectorImpl<ArgInfo> &OutArgs) const {
@@ -1185,6 +1208,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   const Function &F = MF.getFunction();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
 
   // True when we're tail calling, but without -tailcallopt.
@@ -1200,34 +1225,78 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!IsSibCall)
     CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
 
-  unsigned Opc =
-      getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
+  bool IsChainCall = AMDGPU::isChainCC(Info.CallConv);
+  bool IsDynamicVGPRChainCall = false;
+
+  if (IsChainCall) {
+    ArgInfo FlagsArg = Info.OrigArgs[ChainCallArgIdx::Flags];
+    const APInt &FlagsValue = cast<ConstantInt>(FlagsArg.OrigValue)->getValue();
+    if (FlagsValue.isZero()) {
+      if (Info.OrigArgs.size() != 5) {
+        LLVM_DEBUG(dbgs() << "No additional args allowed if flags == 0");
+        return false;
+      }
+    } else if (FlagsValue.isOneBitSet(0)) {
+      IsDynamicVGPRChainCall = true;
+
+      if (Info.OrigArgs.size() != 8) {
+        LLVM_DEBUG(dbgs() << "Expected 3 additional args");
+        return false;
+      }
+
+      // On GFX12, we can only change the VGPR allocation for wave32.
+      if (!ST.isWave32()) {
+        LLVM_DEBUG(dbgs() << "Dynamic VGPR mode is only supported for wave32");
+        return false;
+      }
+
+      ArgInfo FallbackExecArg = Info.OrigArgs[ChainCallArgIdx::FallbackExec];
+      assert(FallbackExecArg.Regs.size() == 1 &&
+             "Expected single register for fallback EXEC");
+      if (!FallbackExecArg.Ty->isIntegerTy(ST.getWavefrontSize())) {
+        LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC");
+        return false;
+      }
+    }
+  }
+
+  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall*/ true,
+                               ST.isWave32(), CalleeCC, IsDynamicVGPRChainCall);
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
-  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
+  if (!addCallTargetOperands(MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
     return false;
 
   // Byte offset for the tail call. When we are sibcalling, this will always
   // be 0.
   MIB.addImm(0);
 
-  // If this is a chain call, we need to pass in the EXEC mask.
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  if (AMDGPU::isChainCC(Info.CallConv)) {
-    ArgInfo ExecArg = Info.OrigArgs[1];
+  // If this is a chain call, we need to pass in the EXEC mask as well as any
+  // other special args.
+  if (IsChainCall) {
+    auto AddRegOrImm = [&](const ArgInfo &Arg) {
+      if (auto CI = dyn_cast<ConstantInt>(Arg.OrigValue)) {
+        MIB.addImm(CI->getSExtValue());
+      } else {
+        MIB.addReg(Arg.Regs[0]);
+        unsigned Idx = MIB->getNumOperands() - 1;
+        MIB->getOperand(Idx).setReg(constrainOperandRegClass(
+            MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
+            MIB->getOperand(Idx), Idx));
+      }
+    };
+
+    ArgInfo ExecArg = Info.OrigArgs[ChainCallArgIdx::Exec];
     assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
 
-    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize())) {
+      LLVM_DEBUG(dbgs() << "Bad type for EXEC");
      return false;
-
-    if (const auto *CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
-      MIB.addImm(CI->getSExtValue());
-    } else {
-      MIB.addReg(ExecArg.Regs[0]);
-      unsigned Idx = MIB->getNumOperands() - 1;
-      MIB->getOperand(Idx).setReg(constrainOperandRegClass(
-          MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
-          MIB->getDesc(), MIB->getOperand(Idx), Idx));
     }
+
+    AddRegOrImm(ExecArg);
+    if (IsDynamicVGPRChainCall)
+      std::for_each(Info.OrigArgs.begin() + ChainCallArgIdx::NumVGPRs,
+                    Info.OrigArgs.end(), AddRegOrImm);
   }
 
   // Tell the call which registers are clobbered.
@@ -1329,9 +1398,9 @@
   // FIXME: We should define regbankselectable call instructions to handle
   // divergent call targets.
   if (MIB->getOperand(0).isReg()) {
-    MIB->getOperand(0).setReg(constrainOperandRegClass(
-        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
-        MIB->getDesc(), MIB->getOperand(0), 0));
+    MIB->getOperand(0).setReg(
+        constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
+                                 *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
   }
 
   MF.getFrameInfo().setHasTailCall();
@@ -1345,11 +1414,6 @@ bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
   ArgInfo Callee = Info.OrigArgs[0];
   ArgInfo SGPRArgs = Info.OrigArgs[2];
   ArgInfo VGPRArgs = Info.OrigArgs[3];
-  ArgInfo Flags = Info.OrigArgs[4];
-
-  assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
-         "Non-zero flags aren't supported yet.");
-  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
 
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = MF.getFunction();
```
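Both the checks in lowerTailCall above and the SelectionDAG path below enforce the same arity rules on the special arguments; restated compactly as a standalone sketch (not shared code from the patch):

```cpp
#include <cstddef>
#include <cstdint>

// flags == 0 keeps the original 5 intrinsic operands; flags == 1 (dynamic
// VGPRs) requires %num_vgprs, %fallback_exec and %fallback_callee, and is
// only accepted in wave32 mode (a GFX12 restriction).
bool isSupportedChainCall(uint64_t Flags, std::size_t NumArgs, bool IsWave32) {
  if (Flags == 0)
    return NumArgs == 5;
  if (Flags == 1)
    return NumArgs == 8 && IsWave32;
  return false; // other flag combinations are rejected
}
```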

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 66 additions & 22 deletions
```diff
@@ -3657,6 +3657,19 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return true;
 }
 
+namespace {
+// Chain calls have special arguments that we need to handle. These are
+// tagging along at the end of the arguments list(s), after the SGPR and VGPR
+// arguments (index 0 and 1 respectively).
+enum ChainCallArgIdx {
+  Exec = 2,
+  Flags,
+  NumVGPRs,
+  FallbackExec,
+  FallbackCallee
+};
+} // anonymous namespace
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
@@ -3665,37 +3678,67 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SelectionDAG &DAG = CLI.DAG;
 
-  TargetLowering::ArgListEntry RequestedExec;
-  if (IsChainCallConv) {
-    // The last argument should be the value that we need to put in EXEC.
-    // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
-    // don't treat it like the rest of the arguments.
-    RequestedExec = CLI.Args.back();
-    assert(RequestedExec.Node && "No node for EXEC");
+  const SDLoc &DL = CLI.DL;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
 
-    if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
+  llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
+  if (IsChainCallConv) {
+    // The last arguments should be the value that we need to put in EXEC,
+    // followed by the flags and any other arguments with special meanings.
+    // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
+    // we don't treat them like the "real" arguments.
+    auto RequestedExecIt = std::find_if(
+        CLI.Outs.begin(), CLI.Outs.end(),
+        [](const ISD::OutputArg &Arg) { return Arg.OrigArgIndex == 2; });
+    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
+
+    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
+    CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
+                      CLI.OutVals.end());
+    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
+
+    assert(CLI.Outs.back().OrigArgIndex < 2 &&
+           "Haven't popped all the special args");
+
+    TargetLowering::ArgListEntry RequestedExecArg =
+        CLI.Args[ChainCallArgIdx::Exec];
+    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
       return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
 
-    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
-    CLI.Outs.pop_back();
-    CLI.OutVals.pop_back();
+    // Convert constants into TargetConstants, so they become immediate operands
+    // instead of being selected into S_MOV.
+    auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
+      if (auto ArgNode = dyn_cast<ConstantSDNode>(Arg.Node))
+        ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
+            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
+      else
+        ChainCallSpecialArgs.push_back(Arg.Node);
+    };
 
-    if (RequestedExec.Ty->isIntegerTy(64)) {
-      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
-      CLI.Outs.pop_back();
-      CLI.OutVals.pop_back();
-    }
+    PushNodeOrTargetConstant(RequestedExecArg);
+
+    // Process any other special arguments depending on the value of the flags.
+    TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
+
+    const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
+    if (FlagsValue.isZero()) {
+      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
+        return lowerUnhandledCall(CLI, InVals,
+                                  "No additional args allowed if flags == 0");
+    } else if (FlagsValue.isOneBitSet(0)) {
+      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
+        return lowerUnhandledCall(CLI, InVals, "Expected 3 additional args");
+      }
 
-    assert(CLI.Outs.back().OrigArgIndex != 2 &&
-           "Haven't popped all the pieces of the EXEC mask");
+      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
+                    CLI.Args.end(), PushNodeOrTargetConstant);
+    }
   }
 
-  const SDLoc &DL = CLI.DL;
   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
-  SDValue Chain = CLI.Chain;
-  SDValue Callee = CLI.Callee;
   bool &IsTailCall = CLI.IsTailCall;
   bool IsVarArg = CLI.IsVarArg;
   bool IsSibCall = false;
@@ -3983,7 +4026,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   if (IsChainCallConv)
-    Ops.push_back(RequestedExec.Node);
+    Ops.insert(Ops.end(), ChainCallSpecialArgs.begin(),
+               ChainCallSpecialArgs.end());
 
   // Add argument registers to the end of the list so that they are known live
   // into the call.
```
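One design point worth spelling out: because the pseudo is only expanded after register allocation, constant special arguments must survive ISel as immediate operands. A simplified sketch of the PushNodeOrTargetConstant idea above (assuming LLVM's SelectionDAG API; not the in-tree lambda itself):

```cpp
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// getConstant is selectable and may become an S_MOV into a register,
// inflating apparent register pressure; getTargetConstant stays an
// immediate operand of the pseudo until the late expansion.
static SDValue wrapChainSpecialArg(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue V) {
  if (auto *C = dyn_cast<ConstantSDNode>(V))
    return DAG.getTargetConstant(C->getAPIntValue(), DL, V.getValueType());
  return V; // non-constants stay as register-bound values
}
```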
