diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index b2697c81fd825..9c1ed8365961f 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4999,6 +4999,12 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { llvm_unreachable("Not Implemented"); } + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo &MFI, int ClobberedFI) const; + /// Target-specific cleanup for formal ByVal parameters. virtual void HandleByVal(CCState *, unsigned &, Align) const {} diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 521d8f07434e6..ec1ce8bfa7ef3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -115,6 +115,36 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, return true; } +SDValue TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo &MFI, + int ClobberedFI) const { + SmallVector ArgChains; + int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. + ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode *U : DAG.getEntryNode().getNode()->users()) + if (LoadSDNode *L = dyn_cast(U)) + if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + /// Set CallLoweringInfo attribute flags based on a call instruction /// and called function attributes. void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6072fd9d8f242..dbc534b269393 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9353,37 +9353,6 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; } -SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo &MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. 
- ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode *U : DAG.getEntryNode().getNode()->users()) - if (LoadSDNode *L = dyn_cast(U)) - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return (CallCC == CallingConv::Fast && TailCallOpt) || diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 32aa913181a21..8f62af30d8c63 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -630,12 +630,6 @@ class AArch64TargetLowering : public TargetLowering { bool isEligibleForTailCallOptimization(const CallLoweringInfo &CLI) const; - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo &MFI, int ClobberedFI) const; - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 971dfdbe3e70a..963e32f7557a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1355,40 +1355,6 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); } -SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo &MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. - ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode *U : DAG.getEntryNode().getNode()->users()) { - if (LoadSDNode *L = dyn_cast(U)) { - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) { - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - } - } - } - - // Build a tokenfactor for all the chains. 
- return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals, StringRef Reason) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 9c0eff99981cd..435c917a29456 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -255,11 +255,6 @@ class AMDGPUTargetLowering : public TargetLowering { const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; - SDValue addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo &MFI, - int ClobberedFI) const; - SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals, StringRef Reason) const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index be53f51afe79f..fc4f3e12ac6ad 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23420,6 +23420,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); + RISCVMachineFunctionInfo *RVFI = MF.getInfo(); switch (CallConv) { default: @@ -23548,6 +23549,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments( continue; } InVals.push_back(ArgValue); + if (Ins[InsIdx].Flags.isByVal()) + RVFI->addIncomingByValArgs(ArgValue); } if (any_of(ArgLocs, @@ -23560,7 +23563,6 @@ SDValue RISCVTargetLowering::LowerFormalArguments( const TargetRegisterClass *RC = &RISCV::GPRRegClass; MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - RISCVMachineFunctionInfo *RVFI = MF.getInfo(); // Size of the vararg save area. For now, the varargs save area is either // zero or large enough to hold a0-a7. @@ -23608,6 +23610,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments( RVFI->setVarArgsSaveSize(VarArgsSaveSize); } + RVFI->setArgumentStackSize(CCInfo.getStackSize()); + // All stores are grouped in one node to allow the matching between // the size of Ins and InVals. This only happens for vararg functions. if (!OutChains.empty()) { @@ -23629,6 +23633,7 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( auto &Outs = CLI.Outs; auto &Caller = MF.getFunction(); auto CallerCC = Caller.getCallingConv(); + auto *RVFI = MF.getInfo(); // Exception-handling functions need a special set of instructions to // indicate a return to the hardware. Tail-calling another function would @@ -23638,29 +23643,28 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( if (Caller.hasFnAttribute("interrupt")) return false; - // Do not tail call opt if the stack is used to pass parameters. - if (CCInfo.getStackSize() != 0) + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getStackSize() > RVFI->getArgumentStackSize()) return false; - // Do not tail call opt if any parameters need to be passed indirectly. - // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are - // passed indirectly. So the address of the value will be passed in a - // register, or if not available, then the address is put on the stack. In - // order to pass indirectly, space on the stack often needs to be allocated - // in order to store the value. 
In this case the CCInfo.getNextStackOffset() - // != 0 check is not enough and we need to check if any CCValAssign ArgsLocs - // are passed CCValAssign::Indirect. - for (auto &VA : ArgLocs) - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - // Do not tail call opt if either caller or callee uses struct return // semantics. auto IsCallerStructRet = Caller.hasStructRetAttr(); auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet(); - if (IsCallerStructRet || IsCalleeStructRet) + if (IsCallerStructRet != IsCalleeStructRet) return false; + // Do not tail call opt if caller's and callee's byval arguments do not match. + for (unsigned i = 0, j = 0; i < Outs.size(); i++) { + if (!Outs[i].Flags.isByVal()) + continue; + if (j++ >= RVFI->getIncomingByValArgsSize()) + return false; + if (RVFI->getIncomingByValArgs(i).getValueType() != Outs[i].ArgVT) + return false; + } + // The callee has to preserve all registers the caller needs to preserve. const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); @@ -23670,12 +23674,12 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( return false; } - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible - // but less efficient and uglier in LowerCall. - for (auto &Arg : Outs) - if (Arg.Flags.isByVal()) - return false; + // If the callee takes no arguments then go on to check the results of the + // call. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SmallVectorImpl &OutVals = CLI.OutVals; + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; return true; } @@ -23704,6 +23708,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); + RISCVMachineFunctionInfo *RVFI = MF.getInfo(); MachineFunction::CallSiteInfo CSInfo; // Set type id for call site info. 
@@ -23738,7 +23743,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Create local copies for byval args SmallVector ByValArgs; - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + for (unsigned i = 0, j = 0, e = Outs.size(); i != e; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; if (!Flags.isByVal()) continue; @@ -23747,16 +23752,27 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned Size = Flags.getByValSize(); Align Alignment = Flags.getNonZeroByValAlign(); - int FI = - MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT); + SDValue Dst; - Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, - /*IsVolatile=*/false, - /*AlwaysInline=*/false, /*CI*/ nullptr, IsTailCall, - MachinePointerInfo(), MachinePointerInfo()); - ByValArgs.push_back(FIPtr); + if (IsTailCall) { + SDValue CallerArg = RVFI->getIncomingByValArgs(j++); + if (isa(Arg) || isa(Arg) || + isa(Arg)) + Dst = CallerArg; + } else { + int FI = + MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false); + Dst = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + } + if (Dst) { + Chain = + DAG.getMemcpy(Chain, DL, Dst, Arg, SizeNode, Alignment, + /*IsVolatile=*/false, + /*AlwaysInline=*/false, /*CI=*/nullptr, std::nullopt, + MachinePointerInfo(), MachinePointerInfo()); + ByValArgs.push_back(Dst); + } } if (!IsTailCall) @@ -23859,8 +23875,12 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, } // Use local copy if it is a byval arg. - if (Flags.isByVal()) - ArgValue = ByValArgs[j++]; + if (Flags.isByVal()) { + if (!IsTailCall || (isa(ArgValue) || + isa(ArgValue) || + isa(ArgValue))) + ArgValue = ByValArgs[j++]; + } if (VA.isRegLoc()) { // Queue up the argument copies and emit them at the end. @@ -23871,20 +23891,32 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i); } else { assert(VA.isMemLoc() && "Argument not register or memory"); - assert(!IsTailCall && "Tail call not allowed if stack is used " - "for passing parameters"); + SDValue DstAddr; + MachinePointerInfo DstInfo; + int32_t Offset = VA.getLocMemOffset(); // Work out the address of the stack slot. if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT); - SDValue Address = - DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, - DAG.getIntPtrConstant(VA.getLocMemOffset(), DL)); + + if (IsTailCall) { + unsigned OpSize = divideCeil(VA.getValVT().getSizeInBits(), 8); + int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); + DstAddr = DAG.getFrameIndex(FI, PtrVT); + DstInfo = MachinePointerInfo::getFixedStack(MF, FI); + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(MF, Offset); + } // Emit the store. 
MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, Address, - MachinePointerInfo::getStack(MF, VA.getLocMemOffset()))); + DAG.getStore(Chain, DL, ArgValue, DstAddr, DstInfo)); } } diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index f9be80feae211..9c2cd708f2784 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -65,6 +65,14 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { uint64_t RVVPadding = 0; /// Size of stack frame to save callee saved registers unsigned CalleeSavedStackSize = 0; + + /// ArgumentStackSize - amount of bytes on stack consumed by the arguments + /// being passed on the stack + unsigned ArgumentStackSize = 0; + + /// Incoming ByVal arguments + SmallVector IncomingByValArgs; + /// Is there any vector argument or return? bool IsVectorCall = false; @@ -142,6 +150,13 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + unsigned getArgumentStackSize() const { return ArgumentStackSize; } + void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + void addIncomingByValArgs(SDValue Val) { IncomingByValArgs.push_back(Val); } + SDValue &getIncomingByValArgs(int Idx) { return IncomingByValArgs[Idx]; } + unsigned getIncomingByValArgsSize() { return IncomingByValArgs.size(); } + enum class PushPopKind { None = 0, StdExtZcmp, VendorXqccmp }; PushPopKind getPushPopKind(const MachineFunction &MF) const; diff --git a/llvm/test/CodeGen/RISCV/musttail-call.ll b/llvm/test/CodeGen/RISCV/musttail-call.ll index f6ec5307b8bad..a3ac3560378db 100644 --- a/llvm/test/CodeGen/RISCV/musttail-call.ll +++ b/llvm/test/CodeGen/RISCV/musttail-call.ll @@ -9,12 +9,13 @@ ; RUN: not --crash llc -mtriple riscv64-unknown-elf -o - %s \ ; RUN: 2>&1 | FileCheck %s -%struct.A = type { i32 } +declare void @callee_musttail() -declare void @callee_musttail(ptr sret(%struct.A) %a) -define void @caller_musttail(ptr sret(%struct.A) %a) { +define void @caller_musttail() #0 { ; CHECK: LLVM ERROR: failed to perform tail call elimination on a call site marked musttail entry: - musttail call void @callee_musttail(ptr sret(%struct.A) %a) + musttail call void @callee_musttail() ret void } + +attributes #0 = { "interrupt"="machine" } diff --git a/llvm/test/CodeGen/RISCV/musttail.ll b/llvm/test/CodeGen/RISCV/musttail.ll new file mode 100644 index 0000000000000..4765fe7a4f233 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/musttail.ll @@ -0,0 +1,571 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 %s -o - | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s --check-prefix=RV64 + +declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) + +define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) { +; RV32-LABEL: many_args_tail: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: 
many_args_tail: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) { +; RV32-LABEL: many_args_musttail: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: many_args_musttail: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = musttail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +; This function has more arguments than it's tail-callee. This isn't valid for +; the musttail attribute, but can still be tail-called as a non-guaranteed +; optimisation, because the outgoing arguments to @many_args_callee fit in the +; stack space allocated by the caller of @more_args_tail. +define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) { +; RV32-LABEL: more_args_tail: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: more_args_tail: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +; Again, this isn't valid for musttail, but can be tail-called in practice +; because the stack size is the same. 
+define i32 @different_args_tail_32bit(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4) { +; RV32-LABEL: different_args_tail_32bit: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: tail many_args_callee +; +; RV64-LABEL: different_args_tail_32bit: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: call many_args_callee +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +define i32 @different_args_tail_64bit(i128 %0, i128 %1, i128 %2, i128 %3, i128 %4) { +; RV32-LABEL: different_args_tail_64bit: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: call many_args_callee +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: different_args_tail_64bit: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail many_args_callee + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +; Here, the caller requires less stack space for it's arguments than the +; callee, so it would not ba valid to do a tail-call. 
+define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) { +; RV32-LABEL: fewer_args_tail: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: li a0, 9 +; RV32-NEXT: li t0, 8 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: li a5, 5 +; RV32-NEXT: li a6, 6 +; RV32-NEXT: li a7, 7 +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: call many_args_callee +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: fewer_args_tail: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: li a0, 9 +; RV64-NEXT: li t0, 8 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a5, 5 +; RV64-NEXT: li a6, 6 +; RV64-NEXT: li a7, 7 +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: call many_args_callee +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8) nounwind { +; RV32-LABEL: bar: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a7 +; RV32-NEXT: mv s1, a6 +; RV32-NEXT: mv s2, a5 +; RV32-NEXT: mv s3, a4 +; RV32-NEXT: mv s4, a3 +; RV32-NEXT: mv s5, a2 +; RV32-NEXT: mv s6, a1 +; RV32-NEXT: mv s7, a0 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, s7 +; RV32-NEXT: call foo +; RV32-NEXT: li a0, 2 +; RV32-NEXT: sw a0, 48(sp) +; RV32-NEXT: mv a0, s7 +; RV32-NEXT: mv a1, s6 +; RV32-NEXT: mv a2, s5 +; RV32-NEXT: mv a3, s4 +; RV32-NEXT: mv a4, s3 +; RV32-NEXT: mv a5, s2 +; RV32-NEXT: mv a6, s1 +; RV32-NEXT: mv a7, s0 +; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: tail foo +; +; RV64-LABEL: bar: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 56(sp) # 8-byte Folded Spill +; 
RV64-NEXT: sd s2, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a7 +; RV64-NEXT: mv s1, a6 +; RV64-NEXT: mv s2, a5 +; RV64-NEXT: mv s3, a4 +; RV64-NEXT: mv s4, a3 +; RV64-NEXT: mv s5, a2 +; RV64-NEXT: mv s6, a1 +; RV64-NEXT: mv s7, a0 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: sd a0, 0(sp) +; RV64-NEXT: mv a0, s7 +; RV64-NEXT: call foo +; RV64-NEXT: li a0, 2 +; RV64-NEXT: sd a0, 80(sp) +; RV64-NEXT: mv a0, s7 +; RV64-NEXT: mv a1, s6 +; RV64-NEXT: mv a2, s5 +; RV64-NEXT: mv a3, s4 +; RV64-NEXT: mv a4, s3 +; RV64-NEXT: mv a5, s2 +; RV64-NEXT: mv a6, s1 +; RV64-NEXT: mv a7, s0 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: tail foo +entry: + call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 1) + musttail call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 2) + ret void +} + +declare void @sret_callee(ptr sret({ double, double }) align 8) + +; Functions which return by sret can be tail-called because the incoming sret +; pointer gets passed through to the callee. +define void @sret_caller_tail(ptr sret({ double, double }) align 8 %result) { +; RV32-LABEL: sret_caller_tail: +; RV32: # %bb.0: # %entry +; RV32-NEXT: tail sret_callee +; +; RV64-LABEL: sret_caller_tail: +; RV64: # %bb.0: # %entry +; RV64-NEXT: tail sret_callee +entry: + tail call void @sret_callee(ptr sret({ double, double }) align 8 %result) + ret void +} + +define void @sret_caller_musttail(ptr sret({ double, double }) align 8 %result) { +; RV32-LABEL: sret_caller_musttail: +; RV32: # %bb.0: # %entry +; RV32-NEXT: tail sret_callee +; +; RV64-LABEL: sret_caller_musttail: +; RV64: # %bb.0: # %entry +; RV64-NEXT: tail sret_callee +entry: + musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result) + ret void +} + +%twenty_bytes = type { [5 x i32] } +declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4) + +; Functions with byval parameters can be tail-called, because the value is +; actually passed in registers in the same way for the caller and callee. +define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; RV32-LABEL: large_caller: +; RV32: # %bb.0: # %entry +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller: +; RV64: # %bb.0: # %entry +; RV64-NEXT: tail large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; As above, but with some inline asm to test that the arguments in r4 is +; re-loaded before the call. 
+define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 %a) nounwind { +; RV32-LABEL: large_caller_check_regs: +; RV32: # %bb.0: # %entry +; RV32-NEXT: #APP +; RV32-NEXT: #NO_APP +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller_check_regs: +; RV64: # %bb.0: # %entry +; RV64-NEXT: #APP +; RV64-NEXT: #NO_APP +; RV64-NEXT: tail large_callee +entry: + tail call void asm sideeffect "", "~{r4}"() + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; The IR for this one looks dodgy, because it has an alloca passed to a +; musttail function, but it is passed as a byval argument, so will be copied +; into the stack space allocated by @large_caller_new_value's caller, so is +; valid. +define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) nounwind { +; RV32-LABEL: large_caller_new_value: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: li a3, 3 +; RV32-NEXT: li a4, 4 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: sw a3, 24(sp) +; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: sw a4, 16(a0) +; RV32-NEXT: sw zero, 0(a0) +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: sw a3, 12(a0) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller_new_value: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: li a3, 3 +; RV64-NEXT: li a4, 4 +; RV64-NEXT: sw zero, 12(sp) +; RV64-NEXT: sw a1, 16(sp) +; RV64-NEXT: sw a2, 20(sp) +; RV64-NEXT: sw a3, 24(sp) +; RV64-NEXT: sw a4, 28(sp) +; RV64-NEXT: sw a4, 16(a0) +; RV64-NEXT: sw zero, 0(a0) +; RV64-NEXT: sw a1, 4(a0) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: sw a3, 12(a0) +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail large_callee +entry: + %y = alloca %twenty_bytes, align 4 + store i32 0, ptr %y, align 4 + %0 = getelementptr inbounds i8, ptr %y, i32 4 + store i32 1, ptr %0, align 4 + %1 = getelementptr inbounds i8, ptr %y, i32 8 + store i32 2, ptr %1, align 4 + %2 = getelementptr inbounds i8, ptr %y, i32 12 + store i32 3, ptr %2, align 4 + %3 = getelementptr inbounds i8, ptr %y, i32 16 + store i32 4, ptr %3, align 4 + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y) + ret void +} + +declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4) +define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; RV32-LABEL: swap_byvals: +; RV32: # %bb.0: # %entry +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: tail two_byvals_callee +; +; RV64-LABEL: swap_byvals: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: tail two_byvals_callee +entry: + musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; A forwarded byval arg, but in a different argument register, so it needs to +; be moved between registers first. This can't be musttail because of the +; different signatures, but is still tail-called as an optimisation. 
+declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4) +define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; RV32-LABEL: shift_byval: +; RV32: # %bb.0: # %entry +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: tail shift_byval_callee +; +; RV64-LABEL: shift_byval: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: tail shift_byval_callee +entry: + tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b) + ret void +} + +; A global object passed to a byval argument, so it must be copied, but doesn't +; need a stack temporary. +@large_global = external global %twenty_bytes +define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; RV32-LABEL: large_caller_from_global: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, %hi(large_global) +; RV32-NEXT: addi a1, a1, %lo(large_global) +; RV32-NEXT: lw a2, 16(a1) +; RV32-NEXT: sw a2, 16(a0) +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: tail large_callee +; +; RV64-LABEL: large_caller_from_global: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, %hi(large_global) +; RV64-NEXT: addi a1, a1, %lo(large_global) +; RV64-NEXT: lw a2, 16(a1) +; RV64-NEXT: sw a2, 16(a0) +; RV64-NEXT: lw a2, 12(a1) +; RV64-NEXT: sw a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: lw a2, 4(a1) +; RV64-NEXT: sw a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: tail large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index 6756fea8a1f85..fa68006059fdb 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -204,49 +204,39 @@ declare i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 % define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n) nounwind { ; CHECK-LABEL: caller_args: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw t0, 32(sp) -; CHECK-NEXT: lw t1, 36(sp) -; CHECK-NEXT: lw t2, 40(sp) -; CHECK-NEXT: lw t3, 44(sp) -; CHECK-NEXT: lw t4, 48(sp) -; CHECK-NEXT: lw t5, 52(sp) -; CHECK-NEXT: sw t4, 16(sp) -; CHECK-NEXT: sw t5, 20(sp) +; CHECK-NEXT: lw t0, 0(sp) +; CHECK-NEXT: lw t1, 20(sp) +; CHECK-NEXT: lw t2, 4(sp) +; CHECK-NEXT: lw t3, 8(sp) +; CHECK-NEXT: lw t4, 12(sp) +; CHECK-NEXT: lw t5, 16(sp) +; CHECK-NEXT: sw t2, 4(sp) +; CHECK-NEXT: sw t3, 8(sp) +; CHECK-NEXT: sw t4, 12(sp) +; CHECK-NEXT: sw t5, 16(sp) +; CHECK-NEXT: sw t1, 20(sp) ; CHECK-NEXT: sw t0, 0(sp) -; CHECK-NEXT: sw t1, 4(sp) -; CHECK-NEXT: sw t2, 8(sp) -; CHECK-NEXT: sw t3, 12(sp) -; CHECK-NEXT: call callee_args -; CHECK-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; CHECK-NEXT: tail callee_args ; ; CHECK-LARGE-ZICFILP-LABEL: caller_args: ; CHECK-LARGE-ZICFILP: # %bb.0: # %entry ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -32 -; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; CHECK-LARGE-ZICFILP-NEXT: lw t0, 32(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t1, 36(sp) -; 
CHECK-LARGE-ZICFILP-NEXT: lw t3, 40(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t4, 44(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t2, 48(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t5, 52(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t2, 16(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t5, 20(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t0, 0(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t1, 20(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t2, 4(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t3, 16(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t4, 12(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t5, 8(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t2, 4(sp) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi8: ; CHECK-LARGE-ZICFILP-NEXT: auipc t2, %pcrel_hi(.LCPI6_0) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi8)(t2) -; CHECK-LARGE-ZICFILP-NEXT: sw t0, 0(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t1, 4(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t3, 8(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t5, 8(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw t4, 12(sp) -; CHECK-LARGE-ZICFILP-NEXT: jalr t2 -; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32 -; CHECK-LARGE-ZICFILP-NEXT: ret +; CHECK-LARGE-ZICFILP-NEXT: sw t3, 16(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t1, 20(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t0, 0(sp) +; CHECK-LARGE-ZICFILP-NEXT: jr t2 entry: %r = tail call i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n) ret i32 %r @@ -257,24 +247,20 @@ declare i32 @callee_indirect_args(fp128 %a) define void @caller_indirect_args() nounwind { ; CHECK-LABEL: caller_indirect_args: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: lui a1, 262128 ; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: sw zero, 0(sp) ; CHECK-NEXT: sw zero, 4(sp) ; CHECK-NEXT: sw zero, 8(sp) ; CHECK-NEXT: sw a1, 12(sp) -; CHECK-NEXT: call callee_indirect_args -; CHECK-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: tail callee_indirect_args ; ; CHECK-LARGE-ZICFILP-LABEL: caller_indirect_args: ; CHECK-LARGE-ZICFILP: # %bb.0: # %entry ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -32 -; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -16 ; CHECK-LARGE-ZICFILP-NEXT: lui a1, 262128 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi9: ; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI7_0) @@ -284,10 +270,8 @@ define void @caller_indirect_args() nounwind { ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 4(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 8(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw a1, 12(sp) -; CHECK-LARGE-ZICFILP-NEXT: jalr t2 -; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32 -; CHECK-LARGE-ZICFILP-NEXT: ret +; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 16 +; CHECK-LARGE-ZICFILP-NEXT: jr t2 entry: %call = tail call i32 @callee_indirect_args(fp128 0xL00000000000000003FFF000000000000) ret void
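As a closing illustration, this is roughly how a target's LowerCall uses addTokenForArgument now that the helper lives on TargetLowering. The fragment mirrors the RISC-V memory-argument hunk above (and the pre-existing AArch64/AMDGPU call sites); the surrounding variables (VA, Offset, ArgValue, MemOpChains, PtrVT) come from that hunk, and this is a sketch of the pattern rather than a self-contained function.

// Outgoing tail-call argument assigned to memory: store it into a fixed
// object inside the caller's incoming argument area rather than allocating
// fresh stack space.
unsigned OpSize = divideCeil(VA.getValVT().getSizeInBits(), 8);
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset,
                                             /*IsImmutable=*/true);
SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT);
MachinePointerInfo DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

// addTokenForArgument folds into the chain any loads of incoming stack
// arguments that overlap this slot, so the store below cannot clobber a
// stack argument before it has been read.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, DstAddr, DstInfo));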