diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index b2697c81fd825..9c1ed8365961f 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4999,6 +4999,12 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { llvm_unreachable("Not Implemented"); } + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo &MFI, int ClobberedFI) const; + /// Target-specific cleanup for formal ByVal parameters. virtual void HandleByVal(CCState *, unsigned &, Align) const {} diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 521d8f07434e6..ec1ce8bfa7ef3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -115,6 +115,36 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, return true; } +SDValue TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo &MFI, + int ClobberedFI) const { + SmallVector ArgChains; + int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. + ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode *U : DAG.getEntryNode().getNode()->users()) + if (LoadSDNode *L = dyn_cast(U)) + if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + /// Set CallLoweringInfo attribute flags based on a call instruction /// and called function attributes. void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6072fd9d8f242..dbc534b269393 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9353,37 +9353,6 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; } -SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo &MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. 
- ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode *U : DAG.getEntryNode().getNode()->users()) - if (LoadSDNode *L = dyn_cast(U)) - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return (CallCC == CallingConv::Fast && TailCallOpt) || diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 32aa913181a21..8f62af30d8c63 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -630,12 +630,6 @@ class AArch64TargetLowering : public TargetLowering { bool isEligibleForTailCallOptimization(const CallLoweringInfo &CLI) const; - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo &MFI, int ClobberedFI) const; - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 971dfdbe3e70a..963e32f7557a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1355,40 +1355,6 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); } -SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo &MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. - ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode *U : DAG.getEntryNode().getNode()->users()) { - if (LoadSDNode *L = dyn_cast(U)) { - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) { - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - } - } - } - - // Build a tokenfactor for all the chains. 
- return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals, StringRef Reason) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 9c0eff99981cd..435c917a29456 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -255,11 +255,6 @@ class AMDGPUTargetLowering : public TargetLowering { const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; - SDValue addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo &MFI, - int ClobberedFI) const; - SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals, StringRef Reason) const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index be53f51afe79f..fc4f3e12ac6ad 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23420,6 +23420,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); + RISCVMachineFunctionInfo *RVFI = MF.getInfo(); switch (CallConv) { default: @@ -23548,6 +23549,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments( continue; } InVals.push_back(ArgValue); + if (Ins[InsIdx].Flags.isByVal()) + RVFI->addIncomingByValArgs(ArgValue); } if (any_of(ArgLocs, @@ -23560,7 +23563,6 @@ SDValue RISCVTargetLowering::LowerFormalArguments( const TargetRegisterClass *RC = &RISCV::GPRRegClass; MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - RISCVMachineFunctionInfo *RVFI = MF.getInfo(); // Size of the vararg save area. For now, the varargs save area is either // zero or large enough to hold a0-a7. @@ -23608,6 +23610,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments( RVFI->setVarArgsSaveSize(VarArgsSaveSize); } + RVFI->setArgumentStackSize(CCInfo.getStackSize()); + // All stores are grouped in one node to allow the matching between // the size of Ins and InVals. This only happens for vararg functions. if (!OutChains.empty()) { @@ -23629,6 +23633,7 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( auto &Outs = CLI.Outs; auto &Caller = MF.getFunction(); auto CallerCC = Caller.getCallingConv(); + auto *RVFI = MF.getInfo(); // Exception-handling functions need a special set of instructions to // indicate a return to the hardware. Tail-calling another function would @@ -23638,29 +23643,28 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( if (Caller.hasFnAttribute("interrupt")) return false; - // Do not tail call opt if the stack is used to pass parameters. - if (CCInfo.getStackSize() != 0) + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getStackSize() > RVFI->getArgumentStackSize()) return false; - // Do not tail call opt if any parameters need to be passed indirectly. - // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are - // passed indirectly. So the address of the value will be passed in a - // register, or if not available, then the address is put on the stack. In - // order to pass indirectly, space on the stack often needs to be allocated - // in order to store the value. 
In this case the CCInfo.getNextStackOffset() - // != 0 check is not enough and we need to check if any CCValAssign ArgsLocs - // are passed CCValAssign::Indirect. - for (auto &VA : ArgLocs) - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - // Do not tail call opt if either caller or callee uses struct return // semantics. auto IsCallerStructRet = Caller.hasStructRetAttr(); auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet(); - if (IsCallerStructRet || IsCalleeStructRet) + if (IsCallerStructRet != IsCalleeStructRet) return false; + // Do not tail call opt if caller's and callee's byval arguments do not match. + for (unsigned i = 0, j = 0; i < Outs.size(); i++) { + if (!Outs[i].Flags.isByVal()) + continue; + if (j++ >= RVFI->getIncomingByValArgsSize()) + return false; + if (RVFI->getIncomingByValArgs(i).getValueType() != Outs[i].ArgVT) + return false; + } + // The callee has to preserve all registers the caller needs to preserve. const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); @@ -23670,12 +23674,12 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( return false; } - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible - // but less efficient and uglier in LowerCall. - for (auto &Arg : Outs) - if (Arg.Flags.isByVal()) - return false; + // If the callee takes no arguments then go on to check the results of the + // call. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SmallVectorImpl &OutVals = CLI.OutVals; + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; return true; } @@ -23704,6 +23708,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); + RISCVMachineFunctionInfo *RVFI = MF.getInfo(); MachineFunction::CallSiteInfo CSInfo; // Set type id for call site info. 
@@ -23738,7 +23743,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Create local copies for byval args SmallVector ByValArgs; - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + for (unsigned i = 0, j = 0, e = Outs.size(); i != e; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; if (!Flags.isByVal()) continue; @@ -23747,16 +23752,27 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned Size = Flags.getByValSize(); Align Alignment = Flags.getNonZeroByValAlign(); - int FI = - MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT); + SDValue Dst; - Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, - /*IsVolatile=*/false, - /*AlwaysInline=*/false, /*CI*/ nullptr, IsTailCall, - MachinePointerInfo(), MachinePointerInfo()); - ByValArgs.push_back(FIPtr); + if (IsTailCall) { + SDValue CallerArg = RVFI->getIncomingByValArgs(j++); + if (isa(Arg) || isa(Arg) || + isa(Arg)) + Dst = CallerArg; + } else { + int FI = + MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false); + Dst = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + } + if (Dst) { + Chain = + DAG.getMemcpy(Chain, DL, Dst, Arg, SizeNode, Alignment, + /*IsVolatile=*/false, + /*AlwaysInline=*/false, /*CI=*/nullptr, std::nullopt, + MachinePointerInfo(), MachinePointerInfo()); + ByValArgs.push_back(Dst); + } } if (!IsTailCall) @@ -23859,8 +23875,12 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, } // Use local copy if it is a byval arg. - if (Flags.isByVal()) - ArgValue = ByValArgs[j++]; + if (Flags.isByVal()) { + if (!IsTailCall || (isa(ArgValue) || + isa(ArgValue) || + isa(ArgValue))) + ArgValue = ByValArgs[j++]; + } if (VA.isRegLoc()) { // Queue up the argument copies and emit them at the end. @@ -23871,20 +23891,32 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i); } else { assert(VA.isMemLoc() && "Argument not register or memory"); - assert(!IsTailCall && "Tail call not allowed if stack is used " - "for passing parameters"); + SDValue DstAddr; + MachinePointerInfo DstInfo; + int32_t Offset = VA.getLocMemOffset(); // Work out the address of the stack slot. if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT); - SDValue Address = - DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, - DAG.getIntPtrConstant(VA.getLocMemOffset(), DL)); + + if (IsTailCall) { + unsigned OpSize = divideCeil(VA.getValVT().getSizeInBits(), 8); + int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); + DstAddr = DAG.getFrameIndex(FI, PtrVT); + DstInfo = MachinePointerInfo::getFixedStack(MF, FI); + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(MF, Offset); + } // Emit the store. 
MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, Address, - MachinePointerInfo::getStack(MF, VA.getLocMemOffset()))); + DAG.getStore(Chain, DL, ArgValue, DstAddr, DstInfo)); } } diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index f9be80feae211..9c2cd708f2784 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -65,6 +65,14 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { uint64_t RVVPadding = 0; /// Size of stack frame to save callee saved registers unsigned CalleeSavedStackSize = 0; + + /// ArgumentStackSize - amount of bytes on stack consumed by the arguments + /// being passed on the stack + unsigned ArgumentStackSize = 0; + + /// Incoming ByVal arguments + SmallVector IncomingByValArgs; + /// Is there any vector argument or return? bool IsVectorCall = false; @@ -142,6 +150,13 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + unsigned getArgumentStackSize() const { return ArgumentStackSize; } + void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + void addIncomingByValArgs(SDValue Val) { IncomingByValArgs.push_back(Val); } + SDValue &getIncomingByValArgs(int Idx) { return IncomingByValArgs[Idx]; } + unsigned getIncomingByValArgsSize() { return IncomingByValArgs.size(); } + enum class PushPopKind { None = 0, StdExtZcmp, VendorXqccmp }; PushPopKind getPushPopKind(const MachineFunction &MF) const; diff --git a/llvm/test/CodeGen/RISCV/musttail-call.ll b/llvm/test/CodeGen/RISCV/musttail-call.ll index f6ec5307b8bad..a3ac3560378db 100644 --- a/llvm/test/CodeGen/RISCV/musttail-call.ll +++ b/llvm/test/CodeGen/RISCV/musttail-call.ll @@ -9,12 +9,13 @@ ; RUN: not --crash llc -mtriple riscv64-unknown-elf -o - %s \ ; RUN: 2>&1 | FileCheck %s -%struct.A = type { i32 } +declare void @callee_musttail() -declare void @callee_musttail(ptr sret(%struct.A) %a) -define void @caller_musttail(ptr sret(%struct.A) %a) { +define void @caller_musttail() #0 { ; CHECK: LLVM ERROR: failed to perform tail call elimination on a call site marked musttail entry: - musttail call void @callee_musttail(ptr sret(%struct.A) %a) + musttail call void @callee_musttail() ret void } + +attributes #0 = { "interrupt"="machine" } diff --git a/llvm/test/CodeGen/RISCV/musttail.ll b/llvm/test/CodeGen/RISCV/musttail.ll new file mode 100644 index 0000000000000..4765fe7a4f233 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/musttail.ll @@ -0,0 +1,571 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 %s -o - | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s --check-prefix=RV64 + +declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) + +define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) { +; RV32-LABEL: many_args_tail: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: 
many_args_tail: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) { +; RV32-LABEL: many_args_musttail: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: many_args_musttail: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = musttail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +; This function has more arguments than it's tail-callee. This isn't valid for +; the musttail attribute, but can still be tail-called as a non-guaranteed +; optimisation, because the outgoing arguments to @many_args_callee fit in the +; stack space allocated by the caller of @more_args_tail. +define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) { +; RV32-LABEL: more_args_tail: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: more_args_tail: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +; Again, this isn't valid for musttail, but can be tail-called in practice +; because the stack size is the same. 
+define i32 @different_args_tail_32bit(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4) { +; RV32-LABEL: different_args_tail_32bit: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: different_args_tail_32bit: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: call many_args_callee +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +define i32 @different_args_tail_64bit(i128 %0, i128 %1, i128 %2, i128 %3, i128 %4) { +; RV32-LABEL: different_args_tail_64bit: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: call many_args_callee +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: different_args_tail_64bit: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +; Here, the caller requires less stack space for it's arguments than the +; callee, so it would not ba valid to do a tail-call. 
+define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) { +; RV32-LABEL: fewer_args_tail: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: call many_args_callee +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: fewer_args_tail: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: call many_args_callee +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8) nounwind { +; RV32-LABEL: bar: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a7 +; RV32-NEXT: mv s1, a6 +; RV32-NEXT: mv s2, a5 +; RV32-NEXT: mv s3, a4 +; RV32-NEXT: mv s4, a3 +; RV32-NEXT: mv s5, a2 +; RV32-NEXT: mv s6, a1 +; RV32-NEXT: mv s7, a0 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, s7 +; RV32-NEXT: call foo +; RV32-NEXT: li a0, 2 +; RV32-NEXT: sw a0, 48(sp) +; RV32-NEXT: mv a0, s7 +; RV32-NEXT: mv a1, s6 +; RV32-NEXT: mv a2, s5 +; RV32-NEXT: mv a3, s4 +; RV32-NEXT: mv a4, s3 +; RV32-NEXT: mv a5, s2 +; RV32-NEXT: mv a6, s1 +; RV32-NEXT: mv a7, s0 +; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: tail foo +; +; RV64-LABEL: bar: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 56(sp) # 8-byte Folded Spill +; 
RV64-NEXT: sd s2, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a7 +; RV64-NEXT: mv s1, a6 +; RV64-NEXT: mv s2, a5 +; RV64-NEXT: mv s3, a4 +; RV64-NEXT: mv s4, a3 +; RV64-NEXT: mv s5, a2 +; RV64-NEXT: mv s6, a1 +; RV64-NEXT: mv s7, a0 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: sd a0, 0(sp) +; RV64-NEXT: mv a0, s7 +; RV64-NEXT: call foo +; RV64-NEXT: li a0, 2 +; RV64-NEXT: sd a0, 80(sp) +; RV64-NEXT: mv a0, s7 +; RV64-NEXT: mv a1, s6 +; RV64-NEXT: mv a2, s5 +; RV64-NEXT: mv a3, s4 +; RV64-NEXT: mv a4, s3 +; RV64-NEXT: mv a5, s2 +; RV64-NEXT: mv a6, s1 +; RV64-NEXT: mv a7, s0 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: tail foo +entry: + call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 1) + musttail call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 2) + ret void +} + +declare void @sret_callee(ptr sret({ double, double }) align 8) + +; Functions which return by sret can be tail-called because the incoming sret +; pointer gets passed through to the callee. +define void @sret_caller_tail(ptr sret({ double, double }) align 8 %result) { +; RV32-LABEL: sret_caller_tail: +; RV32: # %bb.0: # %entry +; RV32-NEXT: tail sret_callee +; +; RV64-LABEL: sret_caller_tail: +; RV64: # %bb.0: # %entry +; RV64-NEXT: tail sret_callee +entry: + tail call void @sret_callee(ptr sret({ double, double }) align 8 %result) + ret void +} + +define void @sret_caller_musttail(ptr sret({ double, double }) align 8 %result) { +; RV32-LABEL: sret_caller_musttail: +; RV32: # %bb.0: # %entry +; RV32-NEXT: tail sret_callee +; +; RV64-LABEL: sret_caller_musttail: +; RV64: # %bb.0: # %entry +; RV64-NEXT: tail sret_callee +entry: + musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result) + ret void +} + +%twenty_bytes = type { [5 x i32] } +declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4) + +; Functions with byval parameters can be tail-called, because the value is +; actually passed in registers in the same way for the caller and callee. +define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; RV32-LABEL: large_caller: +; RV32: # %bb.0: # %entry +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller: +; RV64: # %bb.0: # %entry +; RV64-NEXT: tail large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; As above, but with some inline asm to test that the arguments in r4 is +; re-loaded before the call. 
+define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 %a) nounwind { +; RV32-LABEL: large_caller_check_regs: +; RV32: # %bb.0: # %entry +; RV32-NEXT: #APP +; RV32-NEXT: #NO_APP +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller_check_regs: +; RV64: # %bb.0: # %entry +; RV64-NEXT: #APP +; RV64-NEXT: #NO_APP +; RV64-NEXT: tail large_callee +entry: + tail call void asm sideeffect "", "~{r4}"() + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; The IR for this one looks dodgy, because it has an alloca passed to a +; musttail function, but it is passed as a byval argument, so will be copied +; into the stack space allocated by @large_caller_new_value's caller, so is +; valid. +define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) nounwind { +; RV32-LABEL: large_caller_new_value: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: sw a3, 24(sp) +; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: sw a4, 16(a0) +; RV32-NEXT: sw zero, 0(a0) +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: sw a3, 12(a0) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller_new_value: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: sw zero, 12(sp) +; RV64-NEXT: sw a1, 16(sp) +; RV64-NEXT: sw a2, 20(sp) +; RV64-NEXT: sw a3, 24(sp) +; RV64-NEXT: sw a4, 28(sp) +; RV64-NEXT: sw a4, 16(a0) +; RV64-NEXT: sw zero, 0(a0) +; RV64-NEXT: sw a1, 4(a0) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: sw a3, 12(a0) +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail large_callee +entry: + %y = alloca %twenty_bytes, align 4 + store i32 0, ptr %y, align 4 + %0 = getelementptr inbounds i8, ptr %y, i32 4 + store i32 1, ptr %0, align 4 + %1 = getelementptr inbounds i8, ptr %y, i32 8 + store i32 2, ptr %1, align 4 + %2 = getelementptr inbounds i8, ptr %y, i32 12 + store i32 3, ptr %2, align 4 + %3 = getelementptr inbounds i8, ptr %y, i32 16 + store i32 4, ptr %3, align 4 + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y) + ret void +} + +declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4) +define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; RV32-LABEL: swap_byvals: +; RV32: # %bb.0: # %entry +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: tail two_byvals_callee +; +; RV64-LABEL: swap_byvals: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: tail two_byvals_callee +entry: + musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; A forwarded byval arg, but in a different argument register, so it needs to +; be moved between registers first. This can't be musttail because of the +; different signatures, but is still tail-called as an optimisation. 
+declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4) +define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; RV32-LABEL: shift_byval: +; RV32: # %bb.0: # %entry +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: tail shift_byval_callee +; +; RV64-LABEL: shift_byval: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: tail shift_byval_callee +entry: + tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b) + ret void +} + +; A global object passed to a byval argument, so it must be copied, but doesn't +; need a stack temporary. +@large_global = external global %twenty_bytes +define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; RV32-LABEL: large_caller_from_global: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, %hi(large_global) +; RV32-NEXT: addi a1, a1, %lo(large_global) +; RV32-NEXT: lw a2, 16(a1) +; RV32-NEXT: sw a2, 16(a0) +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller_from_global: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, %hi(large_global) +; RV64-NEXT: addi a1, a1, %lo(large_global) +; RV64-NEXT: lw a2, 16(a1) +; RV64-NEXT: sw a2, 16(a0) +; RV64-NEXT: lw a2, 12(a1) +; RV64-NEXT: sw a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: lw a2, 4(a1) +; RV64-NEXT: sw a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: tail large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index 6756fea8a1f85..fa68006059fdb 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -204,49 +204,39 @@ declare i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 % define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n) nounwind { ; CHECK-LABEL: caller_args: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw t0, 32(sp) -; CHECK-NEXT: lw t1, 36(sp) -; CHECK-NEXT: lw t2, 40(sp) -; CHECK-NEXT: lw t3, 44(sp) -; CHECK-NEXT: lw t4, 48(sp) -; CHECK-NEXT: lw t5, 52(sp) -; CHECK-NEXT: sw t4, 16(sp) -; CHECK-NEXT: sw t5, 20(sp) +; CHECK-NEXT: lw t0, 0(sp) +; CHECK-NEXT: lw t1, 20(sp) +; CHECK-NEXT: lw t2, 4(sp) +; CHECK-NEXT: lw t3, 8(sp) +; CHECK-NEXT: lw t4, 12(sp) +; CHECK-NEXT: lw t5, 16(sp) +; CHECK-NEXT: sw t2, 4(sp) +; CHECK-NEXT: sw t3, 8(sp) +; CHECK-NEXT: sw t4, 12(sp) +; CHECK-NEXT: sw t5, 16(sp) +; CHECK-NEXT: sw t1, 20(sp) ; CHECK-NEXT: sw t0, 0(sp) -; CHECK-NEXT: sw t1, 4(sp) -; CHECK-NEXT: sw t2, 8(sp) -; CHECK-NEXT: sw t3, 12(sp) -; CHECK-NEXT: call callee_args -; CHECK-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; CHECK-NEXT: tail callee_args ; ; CHECK-LARGE-ZICFILP-LABEL: caller_args: ; CHECK-LARGE-ZICFILP: # %bb.0: # %entry ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -32 -; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; CHECK-LARGE-ZICFILP-NEXT: lw t0, 32(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t1, 36(sp) -; 
CHECK-LARGE-ZICFILP-NEXT: lw t3, 40(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t4, 44(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t2, 48(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t5, 52(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t2, 16(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t5, 20(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t0, 0(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t1, 20(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t2, 4(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t3, 16(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t4, 12(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t5, 8(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t2, 4(sp) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi8: ; CHECK-LARGE-ZICFILP-NEXT: auipc t2, %pcrel_hi(.LCPI6_0) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi8)(t2) -; CHECK-LARGE-ZICFILP-NEXT: sw t0, 0(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t1, 4(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t3, 8(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t5, 8(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw t4, 12(sp) -; CHECK-LARGE-ZICFILP-NEXT: jalr t2 -; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32 -; CHECK-LARGE-ZICFILP-NEXT: ret +; CHECK-LARGE-ZICFILP-NEXT: sw t3, 16(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t1, 20(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t0, 0(sp) +; CHECK-LARGE-ZICFILP-NEXT: jr t2 entry: %r = tail call i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n) ret i32 %r @@ -257,24 +247,20 @@ declare i32 @callee_indirect_args(fp128 %a) define void @caller_indirect_args() nounwind { ; CHECK-LABEL: caller_indirect_args: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: lui a1, 262128 ; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: sw zero, 0(sp) ; CHECK-NEXT: sw zero, 4(sp) ; CHECK-NEXT: sw zero, 8(sp) ; CHECK-NEXT: sw a1, 12(sp) -; CHECK-NEXT: call callee_indirect_args -; CHECK-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: tail callee_indirect_args ; ; CHECK-LARGE-ZICFILP-LABEL: caller_indirect_args: ; CHECK-LARGE-ZICFILP: # %bb.0: # %entry ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -32 -; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -16 ; CHECK-LARGE-ZICFILP-NEXT: lui a1, 262128 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi9: ; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI7_0) @@ -284,10 +270,8 @@ define void @caller_indirect_args() nounwind { ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 4(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 8(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw a1, 12(sp) -; CHECK-LARGE-ZICFILP-NEXT: jalr t2 -; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32 -; CHECK-LARGE-ZICFILP-NEXT: ret +; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 16 +; CHECK-LARGE-ZICFILP-NEXT: jr t2 entry: %call = tail call i32 @callee_indirect_args(fp128 0xL00000000000000003FFF000000000000) ret void
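As a closing illustration, this is roughly how a target's LowerCall uses addTokenForArgument now that the helper lives on TargetLowering. The fragment mirrors the RISC-V memory-argument hunk above (and the pre-existing AArch64/AMDGPU call sites); the surrounding variables (VA, Offset, ArgValue, MemOpChains, PtrVT) come from that hunk, and this is a sketch of the pattern rather than a self-contained function.

// Outgoing tail-call argument assigned to memory: store it into a fixed
// object inside the caller's incoming argument area rather than allocating
// fresh stack space.
unsigned OpSize = divideCeil(VA.getValVT().getSizeInBits(), 8);
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset,
                                             /*IsImmutable=*/true);
SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT);
MachinePointerInfo DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

// addTokenForArgument folds into the chain any loads of incoming stack
// arguments that overlap this slot, so the store below cannot clobber a
// stack argument before it has been read.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, DstAddr, DstInfo));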