diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 4ae981e4013e9..8f45ba4575401 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5112,7 +5112,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, RawAddress SRetAlloca = RawAddress::invalid(); llvm::Value *UnusedReturnSizePtr = nullptr; if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) { - if (IsVirtualFunctionPointerThunk && RetAI.isIndirect()) { + // For virtual function pointer thunks and musttail calls, we must always + // forward an incoming SRet pointer to the callee, because a local alloca + // would be de-allocated before the call. These cases both guarantee that + // there will be an incoming SRet argument of the correct type. + if ((IsVirtualFunctionPointerThunk || IsMustTail) && RetAI.isIndirect()) { SRetPtr = makeNaturalAddressForPointer(CurFn->arg_begin() + IRFunctionArgs.getSRetArgNo(), RetTy, CharUnits::fromQuantity(1)); diff --git a/clang/test/CodeGen/musttail-sret.cpp b/clang/test/CodeGen/musttail-sret.cpp new file mode 100644 index 0000000000000..ca67c218cd67f --- /dev/null +++ b/clang/test/CodeGen/musttail-sret.cpp @@ -0,0 +1,84 @@ +// RUN: %clang_cc1 -triple=arm %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-ARM +// RUN: %clang_cc1 -triple=arm64 %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-ARM64 +// RUN: %clang_cc1 -triple=i686 %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-X86 +// RUN: %clang_cc1 -triple=x86_64 %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-X64 + +// Sret tests +struct Big { + int a, b, c, d, e, f, g, h; +}; + +struct Big F1(signed short P0); + +struct Big F2(signed short P0) { + signed short P1 = 20391; + [[clang::musttail]] return F1(P1); +} + +// CHECK-NOT: alloca +// CHECK-ARM: musttail call arm_aapcscc void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef signext 20391) +// CHECK-ARM64: musttail call void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef 20391) +// CHECK-X86: musttail call void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef signext 20391) +// CHECK-X64: musttail call void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef signext 20391) + +struct ReallyBig { + int a[100]; +}; + +// Indirect sret tests +// Function pointer for testing indirect musttail call. 
+struct FunctionPointers { + ReallyBig (*F3)(int, int, int, int, float, double); + ReallyBig (*F4)(int, int, int, char, float, double); +}; + +struct ReallyBig F3(int P0, int P1, int P2, int P3, float P4, double P5); +struct ReallyBig F4(int P0, int P1, int P2, char P3, float P4, double P5); + +static struct FunctionPointers FP = {F3, F4}; + +struct ReallyBig F5 (int P0, int P1, int P2, int P3, float P4, double P5) { + [[clang::musttail]] return FP.F3(P0, P1, P2, P3, P4, P5); +} + +// CHECK-NOT: alloca +// CHECK-ARM: musttail call arm_aapcscc void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5) +// CHECK-ARM64: musttail call void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5) +// CHECK-X86: musttail call void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5) +// CHECK-X64: musttail call void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5) + +struct ReallyBig F6 (int P0, int P1, int P2, char P3, float P4, double P5) { + [[clang::musttail]] return FP.F4(P0, P1, P2, P3, P4, P5); +} + +// Complex and BitInt. Special cases for sret. +// CHECK-NOT: alloca +// CHECK-ARM: musttail call arm_aapcscc void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef signext %P3, float noundef %P4, double noundef %P5) +// CHECK-ARM64: musttail call void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef %P3, float noundef %P4, double noundef %P5) +// CHECK-X86: musttail call void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef signext %P3, float noundef %P4, double noundef %P5) +// CHECK-X64: musttail call void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef signext %P3, float noundef %P4, double noundef %P5) + +double _Complex F7(signed short P0); + +double _Complex F8(signed short P0) { + signed short P1 = 20391; + [[clang::musttail]] return F7(P1); +} + +// CHECK-NOT: alloca +// CHECK-ARM: musttail call arm_aapcscc void @_Z2F7s(ptr dead_on_unwind writable sret({ double, double }) align 8 %agg.result, i16 noundef signext 20391) +// CHECK-ARM64: musttail call noundef { double, double } @_Z2F7s(i16 noundef 20391) +// CHECK-X86: musttail call void @_Z2F7s(ptr dead_on_unwind writable sret({ double, double }) align 4 %agg.result, i16 noundef signext 20391) +// CHECK-X64: musttail call noundef { double, double } @_Z2F7s(i16 noundef signext 20391) + +signed _BitInt(100) F9(float P0, float P1, double P2, char P3); + +signed _BitInt(100) F10(float P0, float P1, double P2, char P3) { + [[clang::musttail]] return F9(P0, P1, P2, P3); +} + +// CHECK-NOT: alloca +// CHECK-ARM: musttail call arm_aapcscc void @_Z2F9ffdc(ptr dead_on_unwind writable sret(i128) align 8 
%agg.result, float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef signext %P3) +// CHECK-ARM64: musttail call noundef i100 @_Z2F9ffdc(float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef %P3) +// CHECK-X86: musttail call void @_Z2F9ffdc(ptr dead_on_unwind writable sret(i128) align 4 %agg.result, float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef signext %P3) +// CHECK-X64: musttail call noundef { i64, i64 } @_Z2F9ffdc(float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef signext %P3) \ No newline at end of file diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1733424a8b669..e738dbfb56966 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2325,6 +2325,59 @@ std::pair ARMTargetLowering::computeAddrForCallArg( return std::make_pair(DstAddr, DstInfo); } +// Returns the type of copying which is required to set up a byval argument to +// a tail-called function. This isn't needed for non-tail calls, because they +// always need the equivalent of CopyOnce, but tail-calls sometimes need two to +// avoid clobbering another argument (CopyViaTemp), and sometimes can be +// optimised to zero copies when forwarding an argument from the caller's +// caller (NoCopy). +ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall( + SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo(); + + // Globals are always safe to copy from. + if (isa(Src) || isa(Src)) + return CopyOnce; + + // Can only analyse frame index nodes, conservatively assume we need a + // temporary. + auto *SrcFrameIdxNode = dyn_cast(Src); + auto *DstFrameIdxNode = dyn_cast(Dst); + if (!SrcFrameIdxNode || !DstFrameIdxNode) + return CopyViaTemp; + + int SrcFI = SrcFrameIdxNode->getIndex(); + int DstFI = DstFrameIdxNode->getIndex(); + assert(MFI.isFixedObjectIndex(DstFI) && + "byval passed in non-fixed stack slot"); + + int64_t SrcOffset = MFI.getObjectOffset(SrcFI); + int64_t DstOffset = MFI.getObjectOffset(DstFI); + + // If the source is in the local frame, then the copy to the argument memory + // is always valid. + bool FixedSrc = MFI.isFixedObjectIndex(SrcFI); + if (!FixedSrc || + (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize())) + return CopyOnce; + + // In the case of byval arguments split between registers and the stack, + // computeAddrForCallArg returns a FrameIndex which corresponds only to the + // stack portion, but the Src SDValue will refer to the full value, including + // the local stack memory that the register portion gets stored into. We only + // need to compare them for equality, so normalise on the full value version. + uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI); + DstOffset -= RegSize; + + // If the value is already in the correct location, then no copying is + // needed. If not, then we need to copy via a temporary. 
+  if (SrcOffset == DstOffset)
+    return NoCopy;
+  else
+    return CopyViaTemp;
+}
+
 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                          SDValue Chain, SDValue &Arg,
                                          RegsToPassVector &RegsToPass,
@@ -2380,6 +2433,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   MachineFunction::CallSiteInfo CSInfo;
   bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool isThisReturn = false;
@@ -2408,8 +2462,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     isTailCall = false;
 
   // For both the non-secure calls and the returns from a CMSE entry function,
-  // the function needs to do some extra work afte r the call, or before the
-  // return, respectively, thus it cannot end with atail call
+  // the function needs to do some extra work after the call, or before the
+  // return, respectively, thus it cannot end with a tail call
   if (isCmseNSCall || AFI->isCmseNSEntryFunction())
     isTailCall = false;
@@ -2492,6 +2546,66 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   RegsToPassVector RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
+  // If we are doing a tail-call, any byval arguments will be written to stack
+  // space which was used for incoming arguments. If any of the values being
+  // used are incoming byval arguments to this function, then they might be
+  // overwritten by the stores of the outgoing arguments. To avoid this, we
+  // need to make a temporary copy of them in local stack space, then copy back
+  // to the argument area.
+  DenseMap<unsigned, SDValue> ByValTemporaries;
+  SDValue ByValTempChain;
+  if (isTailCall) {
+    SmallVector<SDValue, 8> ByValCopyChains;
+    for (const CCValAssign &VA : ArgLocs) {
+      unsigned ArgIdx = VA.getValNo();
+      SDValue Src = OutVals[ArgIdx];
+      ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
+
+      if (!Flags.isByVal())
+        continue;
+
+      SDValue Dst;
+      MachinePointerInfo DstInfo;
+      std::tie(Dst, DstInfo) =
+          computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
+      ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
+
+      if (Copy == NoCopy) {
+        // If the argument is already at the correct offset on the stack
+        // (because we are forwarding a byval argument from our caller), we
+        // don't need any copying.
+        continue;
+      } else if (Copy == CopyOnce) {
+        // If the argument is in our local stack frame, no other argument
+        // preparation can clobber it, so we can copy it to the final location
+        // later.
+        ByValTemporaries[ArgIdx] = Src;
+      } else {
+        assert(Copy == CopyViaTemp && "unexpected enum value");
+        // If we might be copying this argument from the outgoing argument
+        // stack area, we need to copy via a temporary in the local stack
+        // frame.
+ int TempFrameIdx = MFI.CreateStackObject( + Flags.getByValSize(), Flags.getNonZeroByValAlign(), false); + SDValue Temp = + DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout())); + + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); + SDValue AlignNode = + DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); + + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode}; + ByValCopyChains.push_back( + DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); + ByValTemporaries[ArgIdx] = Temp; + } + } + if (!ByValCopyChains.empty()) + ByValTempChain = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains); + } + // During a tail call, stores to the argument area must happen after all of // the function's incoming arguments have been loaded because they may alias. // This is done by folding in a TokenFactor from LowerFormalArguments, but @@ -2529,6 +2643,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { Chain = DAG.getStackArgumentTokenFactor(Chain); + if (ByValTempChain) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, + ByValTempChain); AfterFormalArgLoads = true; } @@ -2600,8 +2717,18 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); - if (CurByValIdx < ByValArgsCount) { + SDValue ByValSrc; + bool NeedsStackCopy; + if (ByValTemporaries.contains(realArgIdx)) { + ByValSrc = ByValTemporaries[realArgIdx]; + NeedsStackCopy = true; + } else { + ByValSrc = Arg; + NeedsStackCopy = !isTailCall; + } + // If part of the argument is in registers, load them. + if (CurByValIdx < ByValArgsCount) { unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); @@ -2610,7 +2737,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); - SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), DAG.InferPtrAlign(AddArg)); @@ -2625,14 +2752,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CCInfo.nextInRegsParam(); } - if (Flags.getByValSize() > 4*offset) { + // If the memory part of the argument isn't already in the correct place + // (which can happen with tail calls), copy it into the argument area. + if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Dst; MachinePointerInfo DstInfo; std::tie(Dst, DstInfo) = computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); - SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = @@ -2962,50 +3091,6 @@ void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, Size = std::max(Size - Excess, 0); } -/// MatchingStackOffset - Return true if the given stack call argument is -/// already available in the same position (relatively) of the caller's -/// incoming argument stack. 
-static -bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, - MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, - const TargetInstrInfo *TII) { - unsigned Bytes = Arg.getValueSizeInBits() / 8; - int FI = std::numeric_limits::max(); - if (Arg.getOpcode() == ISD::CopyFromReg) { - Register VR = cast(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) - return false; - MachineInstr *Def = MRI->getVRegDef(VR); - if (!Def) - return false; - if (!Flags.isByVal()) { - if (!TII->isLoadFromStackSlot(*Def, FI)) - return false; - } else { - return false; - } - } else if (LoadSDNode *Ld = dyn_cast(Arg)) { - if (Flags.isByVal()) - // ByVal argument is passed in as a pointer but it's now being - // dereferenced. e.g. - // define @foo(%struct.X* %A) { - // tail call @bar(%struct.X* byval %A) - // } - return false; - SDValue Ptr = Ld->getBasePtr(); - FrameIndexSDNode *FINode = dyn_cast(Ptr); - if (!FINode) - return false; - FI = FINode->getIndex(); - } else - return false; - - assert(FI != std::numeric_limits::max()); - if (!MFI.isFixedObjectIndex(FI)) - return false; - return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); -} - /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. Note that this function also @@ -3047,8 +3132,10 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( for (const CCValAssign &AL : ArgLocs) if (AL.isRegLoc()) AddressRegisters.erase(AL.getLocReg()); - if (AddressRegisters.empty()) + if (AddressRegisters.empty()) { + LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n"); return false; + } } // Look for obvious safe cases to perform tail call optimization that do not @@ -3057,18 +3144,26 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Exception-handling functions need a special set of instructions to indicate // a return to the hardware. Tail-calling another function would probably // break this. - if (CallerF.hasFnAttribute("interrupt")) + if (CallerF.hasFnAttribute("interrupt")) { + LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n"); return false; + } - if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) + if (canGuaranteeTCO(CalleeCC, + getTargetMachine().Options.GuaranteedTailCallOpt)) { + LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false") + << " (guaranteed tail-call CC)\n"); return CalleeCC == CallerCC; + } // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet(); bool isCallerStructRet = MF.getFunction().hasStructRetAttr(); - if (isCalleeStructRet || isCallerStructRet) + if (isCalleeStructRet != isCallerStructRet) { + LLVM_DEBUG(dbgs() << "false (struct-ret)\n"); return false; + } // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic @@ -3081,8 +3176,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && - (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) + (!TT.isOSWindows() || TT.isOSBinFormatELF() || + TT.isOSBinFormatMachO())) { + LLVM_DEBUG(dbgs() << "false (external weak linkage)\n"); return false; + } } // Check that the call results are passed in the same way. 
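As a source-level illustration of the relaxed sret rule above (a tail call is now rejected only when exactly one of caller and callee returns via sret, rather than whenever either does), here is a minimal C++ sketch. It is illustrative only and not part of the patch; make_big and forward_big are invented names, and it assumes the AAPCS behaviour that a 32-byte aggregate is returned indirectly through a pointer supplied by the caller.

struct Big { int a, b, c, d, e, f, g, h; }; // returned via an sret pointer on ARM
Big make_big(int seed);
Big forward_big(int seed) {
  // Caller and callee both return via sret, so the caller can forward its own
  // incoming sret pointer and this call may now be lowered as a tail call
  // ("b make_big" rather than "bl" followed by a copy back).
  return make_big(seed + 1);
}
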
@@ -3091,70 +3189,44 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( getEffectiveCallingConv(CalleeCC, isVarArg), getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, CCAssignFnForReturn(CalleeCC, isVarArg), - CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) + CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) { + LLVM_DEBUG(dbgs() << "false (incompatible results)\n"); return false; + } // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (CalleeCC != CallerCC) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); - if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) { + LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n"); return false; + } } - // If Caller's vararg or byval argument has been split between registers and - // stack, do not perform tail call, since part of the argument is in caller's - // local frame. + // If Caller's vararg argument has been split between registers and stack, do + // not perform tail call, since part of the argument is in caller's local + // frame. const ARMFunctionInfo *AFI_Caller = MF.getInfo(); - if (AFI_Caller->getArgRegsSaveSize()) + if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) { + LLVM_DEBUG(dbgs() << "false (arg reg save area)\n"); return false; + } // If the callee takes no arguments then go on to check the results of the // call. - if (!Outs.empty()) { - if (CCInfo.getStackSize()) { - // Check if the arguments are already laid out in the right way as - // the caller's fixed stack objects. - MachineFrameInfo &MFI = MF.getFrameInfo(); - const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); - i != e; - ++i, ++realArgIdx) { - CCValAssign &VA = ArgLocs[i]; - EVT RegVT = VA.getLocVT(); - SDValue Arg = OutVals[realArgIdx]; - ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { - // f64 and vector types are split into multiple registers or - // register/stack-slot combinations. The types will not match - // the registers; give up on memory f64 refs until we figure - // out what to do about this. - if (!VA.isRegLoc()) - return false; - if (!ArgLocs[++i].isRegLoc()) - return false; - if (RegVT == MVT::v2f64) { - if (!ArgLocs[++i].isRegLoc()) - return false; - if (!ArgLocs[++i].isRegLoc()) - return false; - } - } else if (!VA.isRegLoc()) { - if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, - MFI, MRI, TII)) - return false; - } - } - } - - const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) - return false; + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) { + LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n"); + return false; } + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. 
+  if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "true\n");
   return true;
 }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index a255e9b6fc365..675671d748a1a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -395,6 +395,19 @@ class VectorType;
   // ARMTargetLowering - ARM Implementation of the TargetLowering interface
   class ARMTargetLowering : public TargetLowering {
+    // Copying needed for an outgoing byval argument.
+    enum ByValCopyKind {
+      // Argument is already in the correct location, no copy needed.
+      NoCopy,
+      // Argument value is currently in the local stack frame, needs copying to
+      // outgoing argument area.
+      CopyOnce,
+      // Argument value is currently in the outgoing argument area, but not at
+      // the correct offset, so needs copying via a temporary in the local stack
+      // space.
+      CopyViaTemp,
+    };
+
   public:
     explicit ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI);
@@ -811,6 +824,9 @@ class VectorType;
     computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG,
                           const CCValAssign &VA, SDValue StackPtr,
                           bool IsTailCall, int SPDiff) const;
+    ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
+                                            SDValue Dst,
+                                            ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
index d8e22f4f5312a..e186ae3a96150 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
@@ -12,17 +12,11 @@ define void @check227(
 ; arg1 --> SP+188
 entry:
-
-;CHECK: sub sp, sp, #12
-;CHECK: push {r11, lr}
-;CHECK: sub sp, sp, #4
-;CHECK: add r0, sp, #12
-;CHECK: stm r0, {r1, r2, r3}
-;CHECK: ldr r0, [sp, #212]
-;CHECK: bl useInt
-;CHECK: add sp, sp, #4
-;CHECK: pop {r11, lr}
-;CHECK: add sp, sp, #12
+; CHECK: sub sp, sp, #12
+; CHECK: stm sp, {r1, r2, r3}
+; CHECK: ldr r0, [sp, #200]
+; CHECK: add sp, sp, #12
+; CHECK: b useInt
   %0 = ptrtoint ptr %arg1 to i32
   tail call void @useInt(i32 %0)
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
index 0c5d22984b99e..efdecce9ae723 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
@@ -7,14 +7,11 @@ define void @foo(ptr byval(%struct4bytes) %p0, ; --> R0
                  ptr byval(%struct20bytes) %p1 ; --> R1,R2,R3, [SP+0 ..
SP+8) ) { -;CHECK: sub sp, sp, #16 -;CHECK: push {r11, lr} -;CHECK: add r12, sp, #8 -;CHECK: stm r12, {r0, r1, r2, r3} -;CHECK: add r0, sp, #12 -;CHECK: bl useInt -;CHECK: pop {r11, lr} -;CHECK: add sp, sp, #16 +;CHECK: sub sp, sp, #16 +;CHECK: stm sp, {r0, r1, r2, r3} +;CHECK: add r0, sp, #4 +;CHECK: add sp, sp, #16 +;CHECK: b useInt %1 = ptrtoint ptr %p1 to i32 tail call void @useInt(i32 %1) diff --git a/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll b/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll index 4996cc8ecbf02..1e46b081acfdf 100644 --- a/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll +++ b/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll @@ -1,8 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -float-abi=soft %s -o - | FileCheck %s ; CHECK: function1 -; CHECK-NOT: vmov define double @function1(double %a, double %b, double %c, double %d, double %e, double %f) nounwind noinline ssp { +; CHECK-LABEL: function1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: vldr d16, [sp, #40] +; CHECK-NEXT: vldr d17, [sp, #32] +; CHECK-NEXT: vmov r12, lr, d16 +; CHECK-NEXT: vldr d16, [sp, #16] +; CHECK-NEXT: vmov r4, r5, d17 +; CHECK-NEXT: vldr d17, [sp, #24] +; CHECK-NEXT: str r3, [sp, #36] +; CHECK-NEXT: str r2, [sp, #32] +; CHECK-NEXT: str r1, [sp, #44] +; CHECK-NEXT: str r0, [sp, #40] +; CHECK-NEXT: vstr d17, [sp, #16] +; CHECK-NEXT: vstr d16, [sp, #24] +; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: mov r1, lr +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: pop {r4, r5, r11, lr} +; CHECK-NEXT: b function2 entry: %call = tail call double @function2(double %f, double %e, double %d, double %c, double %b, double %a) nounwind ret double %call diff --git a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll index 6fc56967bc7aa..65aff46658fd1 100644 --- a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll +++ b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll @@ -145,26 +145,21 @@ entry: define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x half>, <8 x half>) { ; SOFT-LABEL: many_args_test: ; SOFT: @ %bb.0: @ %entry -; SOFT-NEXT: push {r11, lr} -; SOFT-NEXT: sub sp, sp, #32 -; SOFT-NEXT: add r12, sp, #80 +; SOFT-NEXT: add r12, sp, #40 ; SOFT-NEXT: vld1.64 {d16, d17}, [r12] -; SOFT-NEXT: add r12, sp, #48 +; SOFT-NEXT: add r12, sp, #8 ; SOFT-NEXT: vabs.f16 q8, q8 ; SOFT-NEXT: vld1.64 {d18, d19}, [r12] -; SOFT-NEXT: add r12, sp, #64 +; SOFT-NEXT: add r12, sp, #24 ; SOFT-NEXT: vadd.f16 q8, q8, q9 ; SOFT-NEXT: vld1.64 {d18, d19}, [r12] ; SOFT-NEXT: add r12, sp, #16 ; SOFT-NEXT: vmul.f16 q8, q9, q8 ; SOFT-NEXT: vst1.64 {d16, d17}, [r12] -; SOFT-NEXT: mov r12, sp -; SOFT-NEXT: vldr d16, [sp, #40] -; SOFT-NEXT: vst1.16 {d16}, [r12:64]! 
-; SOFT-NEXT: str r3, [r12] -; SOFT-NEXT: bl use -; SOFT-NEXT: add sp, sp, #32 -; SOFT-NEXT: pop {r11, pc} +; SOFT-NEXT: vldr d16, [sp] +; SOFT-NEXT: vstr d16, [sp] +; SOFT-NEXT: str r3, [sp, #8] +; SOFT-NEXT: b use ; ; HARD-LABEL: many_args_test: ; HARD: @ %bb.0: @ %entry @@ -177,33 +172,25 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal ; ; SOFTEB-LABEL: many_args_test: ; SOFTEB: @ %bb.0: @ %entry -; SOFTEB-NEXT: .save {r11, lr} -; SOFTEB-NEXT: push {r11, lr} -; SOFTEB-NEXT: .pad #32 -; SOFTEB-NEXT: sub sp, sp, #32 -; SOFTEB-NEXT: add r12, sp, #80 -; SOFTEB-NEXT: mov lr, sp +; SOFTEB-NEXT: add r12, sp, #40 ; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12] -; SOFTEB-NEXT: add r12, sp, #48 +; SOFTEB-NEXT: add r12, sp, #8 ; SOFTEB-NEXT: vrev64.16 q8, q8 ; SOFTEB-NEXT: vabs.f16 q8, q8 ; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12] -; SOFTEB-NEXT: add r12, sp, #64 +; SOFTEB-NEXT: add r12, sp, #24 ; SOFTEB-NEXT: vrev64.16 q9, q9 ; SOFTEB-NEXT: vadd.f16 q8, q8, q9 ; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12] ; SOFTEB-NEXT: add r12, sp, #16 ; SOFTEB-NEXT: vrev64.16 q9, q9 ; SOFTEB-NEXT: vmul.f16 q8, q9, q8 -; SOFTEB-NEXT: vldr d18, [sp, #40] -; SOFTEB-NEXT: vrev64.16 d18, d18 -; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]! -; SOFTEB-NEXT: str r3, [lr] +; SOFTEB-NEXT: vldr d18, [sp] ; SOFTEB-NEXT: vrev64.16 q8, q8 ; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12] -; SOFTEB-NEXT: bl use -; SOFTEB-NEXT: add sp, sp, #32 -; SOFTEB-NEXT: pop {r11, pc} +; SOFTEB-NEXT: vstr d18, [sp] +; SOFTEB-NEXT: str r3, [sp, #8] +; SOFTEB-NEXT: b use ; ; HARDEB-LABEL: many_args_test: ; HARDEB: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/musttail.ll b/llvm/test/CodeGen/ARM/musttail.ll new file mode 100644 index 0000000000000..aa7b81d070021 --- /dev/null +++ b/llvm/test/CodeGen/ARM/musttail.ll @@ -0,0 +1,342 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=armv7a-none-eabi %s -o - | FileCheck %s + +declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) + +define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { +; CHECK-LABEL: many_args_tail: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r0, #5 +; CHECK-NEXT: mov r1, #2 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: mov r0, #6 +; CHECK-NEXT: str r0, [sp, #4] +; CHECK-NEXT: mov r0, #1 +; CHECK-NEXT: mov r2, #3 +; CHECK-NEXT: mov r3, #4 +; CHECK-NEXT: b many_args_callee + %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6) + ret i32 %ret +} + +define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { +; CHECK-LABEL: many_args_musttail: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r0, #5 +; CHECK-NEXT: mov r1, #2 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: mov r0, #6 +; CHECK-NEXT: str r0, [sp, #4] +; CHECK-NEXT: mov r0, #1 +; CHECK-NEXT: mov r2, #3 +; CHECK-NEXT: mov r3, #4 +; CHECK-NEXT: b many_args_callee + %ret = musttail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6) + ret i32 %ret +} + +; This function has more arguments than it's tail-callee. This isn't valid for +; the musttail attribute, but can still be tail-called as a non-guaranteed +; optimisation, because the outgoing arguments to @many_args_callee fit in the +; stack space allocated by the caller of @more_args_tail. 
+define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+; CHECK-LABEL: more_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, #6
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: b many_args_callee
+  %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  ret i32 %ret
+}
+
+; Again, this isn't valid for musttail, but can be tail-called in practice
+; because the stack size is the same.
+define i32 @different_args_tail(i64 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: different_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, #6
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: b many_args_callee
+  %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  ret i32 %ret
+}
+
+; Here, the caller requires less stack space for its arguments than the
+; callee, so it would not be valid to do a tail-call.
+define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) {
+; CHECK-LABEL: fewer_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, sp, #8
+; CHECK-NEXT: mov r1, #6
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: strd r0, r1, [sp]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: bl many_args_callee
+; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: pop {r11, pc}
+  %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  ret i32 %ret
+}
+
+declare void @sret_callee(ptr sret({ double, double }) align 8)
+
+; Functions which return by sret can be tail-called because the incoming sret
+; pointer gets passed through to the callee.
+define void @sret_caller_tail(ptr sret({ double, double }) align 8 %result) {
+; CHECK-LABEL: sret_caller_tail:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b sret_callee
+entry:
+  tail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
+  ret void
+}
+
+define void @sret_caller_musttail(ptr sret({ double, double }) align 8 %result) {
+; CHECK-LABEL: sret_caller_musttail:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b sret_callee
+entry:
+  musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
+  ret void
+}
+
+; Clang only uses byval for arguments of 65 bytes or larger, but we test with a
+; 20 byte struct to keep the tests more readable. This size was chosen to still
+; make sure that it will be split between registers and the stack, to test all
+; of the interesting code paths in the backend.
+%twenty_bytes = type { [5 x i32] }
+declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+
+; Functions with byval parameters can be tail-called, because the value is
+; actually passed in registers and the stack in the same way for the caller and
+; callee. Within @large_caller the first 16 bytes of the argument are spilled
+; to the local stack frame, but for the tail-call they are passed in r0-r3, so
+; it's safe to de-allocate that memory before the call.
+; TODO: The SUB and STM instructions are unnecessary and could be optimised
+; out, but the behaviour of this is still correct.
+define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: stm sp!, {r0, r1, r2, r3} +; CHECK-NEXT: b large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; As above, but with some inline asm to test that the arguments in r0-r3 are +; re-loaded before the call. +define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_check_regs: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: stm sp, {r0, r1, r2, r3} +; CHECK-NEXT: @APP +; CHECK-NEXT: @NO_APP +; CHECK-NEXT: pop {r0, r1, r2, r3} +; CHECK-NEXT: b large_callee +entry: + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"() + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; The IR for this one looks dodgy, because it has an alloca passed to a +; musttail function, but it is passed as a byval argument, so will be copied +; into the stack space allocated by @large_caller_new_value's caller, so is +; valid. +define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_new_value: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #36 +; CHECK-NEXT: sub sp, sp, #36 +; CHECK-NEXT: add r12, sp, #20 +; CHECK-NEXT: stm r12, {r0, r1, r2, r3} +; CHECK-NEXT: mov r0, #4 +; CHECK-NEXT: add r1, sp, #36 +; CHECK-NEXT: str r0, [sp, #16] +; CHECK-NEXT: mov r0, #3 +; CHECK-NEXT: str r0, [sp, #12] +; CHECK-NEXT: mov r0, #2 +; CHECK-NEXT: str r0, [sp, #8] +; CHECK-NEXT: mov r0, #1 +; CHECK-NEXT: str r0, [sp, #4] +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r0, r0, #16 +; CHECK-NEXT: mov r3, #3 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r1, #1 +; CHECK-NEXT: mov r2, #2 +; CHECK-NEXT: add sp, sp, #36 +; CHECK-NEXT: b large_callee +entry: + %y = alloca %twenty_bytes, align 4 + store i32 0, ptr %y, align 4 + %0 = getelementptr inbounds i8, ptr %y, i32 4 + store i32 1, ptr %0, align 4 + %1 = getelementptr inbounds i8, ptr %y, i32 8 + store i32 2, ptr %1, align 4 + %2 = getelementptr inbounds i8, ptr %y, i32 12 + store i32 3, ptr %2, align 4 + %3 = getelementptr inbounds i8, ptr %y, i32 16 + store i32 4, ptr %3, align 4 + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y) + ret void +} + +declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4) +define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: swap_byvals: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, sp, #40 +; CHECK-NEXT: add r12, sp, #56 +; CHECK-NEXT: add lr, sp, #20 +; CHECK-NEXT: stm r12, {r0, r1, r2, r3} +; CHECK-NEXT: add r0, sp, #56 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: add r3, sp, #20 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: add r4, sp, #76 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, 
[r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: add r0, sp, #76 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: mov r2, lr +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: str r1, [r2], #4 +; CHECK-NEXT: ldm r3, {r0, r1, r2, r3} +; CHECK-NEXT: ldr r5, [r12], #4 +; CHECK-NEXT: str r5, [r4], #4 +; CHECK-NEXT: ldr r5, [r12], #4 +; CHECK-NEXT: str r5, [r4], #4 +; CHECK-NEXT: ldr r5, [r12], #4 +; CHECK-NEXT: str r5, [r4], #4 +; CHECK-NEXT: ldr r5, [r12], #4 +; CHECK-NEXT: str r5, [r4], #4 +; CHECK-NEXT: ldr r5, [r12], #4 +; CHECK-NEXT: str r5, [r4], #4 +; CHECK-NEXT: add r5, lr, #16 +; CHECK-NEXT: add r12, sp, #72 +; CHECK-NEXT: ldr r4, [r5], #4 +; CHECK-NEXT: str r4, [r12], #4 +; CHECK-NEXT: add sp, sp, #40 +; CHECK-NEXT: pop {r4, r5, r11, lr} +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: b two_byvals_callee +entry: + musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; A forwarded byval arg, but at a different offset on the stack, so it needs to +; be copied to the local stack frame first. This can't be musttail because of +; the different signatures, but is still tail-called as an optimisation. +declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4) +define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: shift_byval: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, sp, #12 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, sp, #20 +; CHECK-NEXT: add r0, sp, #28 +; CHECK-NEXT: add lr, sp, #40 +; CHECK-NEXT: stm r0, {r1, r2, r3} +; CHECK-NEXT: add r0, sp, #28 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: add r12, r1, #16 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldm sp, {r0, r1, r2, r3} +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: str r4, [lr], #4 +; CHECK-NEXT: add sp, sp, #20 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: add sp, sp, #12 +; CHECK-NEXT: b shift_byval_callee +entry: + tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b) + ret void +} + +; A global object passed to a byval argument, so it must be copied, but doesn't +; need a stack temporary. 
+@large_global = external global %twenty_bytes +define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_from_global: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: add r12, sp, #8 +; CHECK-NEXT: add lr, sp, #24 +; CHECK-NEXT: stm r12, {r0, r1, r2, r3} +; CHECK-NEXT: movw r3, :lower16:large_global +; CHECK-NEXT: movt r3, :upper16:large_global +; CHECK-NEXT: add r12, r3, #16 +; CHECK-NEXT: ldm r3, {r0, r1, r2, r3} +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: str r4, [lr], #4 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: b large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global) + ret void +}
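
To connect the IR tests above back to user code, here is a minimal C++ sketch of the byval-forwarding case that @large_caller models. It is illustrative only and not part of the patch; Huge, huge_callee and huge_caller are invented names, and the 68-byte size is chosen only to be over the 65-byte clang byval threshold mentioned in the comments above.

struct Huge { int a[17]; };  // 68 bytes, so clang passes it byval on ARM
void huge_callee(Huge h);
void huge_caller(Huge h) {
  // The incoming byval argument is forwarded unchanged: the stack portion is
  // already at the right offset (the NoCopy case), and the register portion
  // is reloaded into r0-r3, so no extra copy of the whole struct is needed
  // and the call can be lowered as a tail call.
  [[clang::musttail]] return huge_callee(h);
}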