diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..157642e14f68d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1067,6 +1067,19 @@ namespace llvm {
   //===--------------------------------------------------------------------===//
   //  X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
+    // Copying needed for an outgoing byval argument.
+    enum ByValCopyKind {
+      // Argument is already in the correct location, no copy needed.
+      NoCopy,
+      // Argument value is currently in the local stack frame, needs copying
+      // to the outgoing argument area.
+      CopyOnce,
+      // Argument value is currently in the outgoing argument area, but not at
+      // the correct offset, so needs copying via a temporary in local stack
+      // space.
+      CopyViaTemp,
+    };
+
   public:
     explicit X86TargetLowering(const X86TargetMachine &TM,
                                const X86Subtarget &STI);
@@ -1775,6 +1788,9 @@ namespace llvm {
     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
+                                            SDValue Dst,
+                                            ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 37d77728882b1..d1e8ad75a7d9a 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2018,6 +2018,49 @@ SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
 }
 
+// Returns the type of copying which is required to set up a byval argument to
+// a tail-called function. This isn't needed for non-tail calls, because they
+// always need the equivalent of CopyOnce, but tail calls sometimes need two
+// copies to avoid clobbering another argument (CopyViaTemp), and can
+// sometimes be optimised to zero copies when forwarding an argument from the
+// caller's caller (NoCopy).
+X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
+    SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Globals are always safe to copy from.
+  if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
+    return CopyOnce;
+
+  // We can only analyse frame index nodes; conservatively assume anything
+  // else needs a temporary.
+  auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
+  auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
+  if (!SrcFrameIdxNode || !DstFrameIdxNode)
+    return CopyViaTemp;
+
+  int SrcFI = SrcFrameIdxNode->getIndex();
+  int DstFI = DstFrameIdxNode->getIndex();
+  assert(MFI.isFixedObjectIndex(DstFI) &&
+         "byval passed in non-fixed stack slot");
+
+  int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
+  int64_t DstOffset = MFI.getObjectOffset(DstFI);
+
+  // If the source is in the local frame, then the copy to the argument
+  // memory is always valid.
+  bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
+  if (!FixedSrc || SrcOffset < 0)
+    return CopyOnce;
+
+  // If the value is already in the correct location, then no copying is
+  // needed. If not, then we need to copy via a temporary.
+  if (SrcOffset == DstOffset)
+    return NoCopy;
+  return CopyViaTemp;
+}
+
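+// Illustration only (not part of the lowering): the classification above,
+// restated with plain integers standing in for the MachineFrameInfo queries.
+//
+//   enum ByValCopyKind { NoCopy, CopyOnce, CopyViaTemp };
+//
+//   // FixedSrc: the source lives in the fixed (incoming-argument) area.
+//   ByValCopyKind classify(bool FixedSrc, int64_t SrcOffset,
+//                          int64_t DstOffset) {
+//     if (!FixedSrc || SrcOffset < 0) // local frame: stores to the outgoing
+//       return CopyOnce;              // argument area cannot clobber it
+//     return SrcOffset == DstOffset ? NoCopy : CopyViaTemp;
+//   }
+//
+//   classify(true, 8, 8)    == NoCopy      (forwarded incoming byval)
+//   classify(false, -24, 8) == CopyOnce    (local alloca)
+//   classify(true, 8, 24)   == CopyViaTemp (byval moving between slots)
+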
 SDValue
 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
@@ -2098,15 +2141,18 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     isTailCall = false;
   }
 
-  if (isTailCall && !IsMustTail) {
+  if (isTailCall) {
     // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
-                                                   IsCalleePopSRet);
+    IsSibcall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
+                                                  IsCalleePopSRet);
 
-    // Sibcalls are automatically detected tailcalls which do not require
-    // ABI changes.
-    if (!IsGuaranteeTCO && isTailCall)
-      IsSibcall = true;
+    if (!IsMustTail) {
+      isTailCall = IsSibcall;
+
+      // Sibcalls are automatically detected tailcalls which do not require
+      // ABI changes.
+      IsSibcall = IsSibcall && !IsGuaranteeTCO;
+    }
 
     if (isTailCall)
       ++NumTailCalls;
@@ -2128,8 +2174,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
+  // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
   int FPDiff = 0;
-  if (isTailCall &&
+  if (isTailCall && !IsSibcall &&
       shouldGuaranteeTCO(CallConv,
                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
     // Lower arguments at fp - stackoffset + fpdiff.
@@ -2146,6 +2193,74 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   unsigned NumBytesToPush = NumBytes;
   unsigned NumBytesToPop = NumBytes;
 
+  SDValue StackPtr;
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // If we are doing a tail-call, any byval arguments will be written to stack
+  // space which was used for incoming arguments. If any of the values being
+  // used are incoming byval arguments to this function, then they might be
+  // overwritten by the stores of the outgoing arguments. To avoid this, we
+  // need to make a temporary copy of them in local stack space, then copy
+  // back to the argument area.
+  DenseMap<unsigned, SDValue> ByValTemporaries;
+  SDValue ByValTempChain;
+  if (isTailCall) {
+    SmallVector<SDValue, 8> ByValCopyChains;
+    for (const CCValAssign &VA : ArgLocs) {
+      unsigned ArgIdx = VA.getValNo();
+      SDValue Src = OutVals[ArgIdx];
+      ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
+
+      if (!Flags.isByVal())
+        continue;
+
+      auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+      if (!StackPtr.getNode())
+        StackPtr =
+            DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), PtrVT);
+
+      // Destination: where this byval should live in the callee's frame
+      // after the tail call.
+      int32_t Offset = VA.getLocMemOffset() + FPDiff;
+      int Size = VA.getLocVT().getFixedSizeInBits() / 8;
+      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+      SDValue Dst = DAG.getFrameIndex(FI, PtrVT);
+
+      ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
+
+      if (Copy == NoCopy) {
+        // If the argument is already at the correct offset on the stack
+        // (because we are forwarding a byval argument from our caller), we
+        // don't need any copying.
+        continue;
+      } else if (Copy == CopyOnce) {
+        // If the argument is in our local stack frame, no other argument
+        // preparation can clobber it, so we can copy it to the final location
+        // later.
+        ByValTemporaries[ArgIdx] = Src;
+      } else {
+        assert(Copy == CopyViaTemp && "unexpected enum value");
+        // If we might be copying this argument from the outgoing argument
+        // stack area, we need to copy via a temporary in the local stack
+        // frame.
+        MachineFrameInfo &MFI = MF.getFrameInfo();
+        int TempFrameIdx = MFI.CreateStackObject(Flags.getByValSize(),
+                                                 Flags.getNonZeroByValAlign(),
+                                                 /*isSS=*/false);
+        SDValue Temp =
+            DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
+
+        SDValue CopyChain =
+            CreateCopyOfByValArgument(Src, Temp, Chain, Flags, DAG, dl);
+        ByValCopyChains.push_back(CopyChain);
+        // Record the temporary so the second pass copies it, rather than the
+        // original source, into the final argument slot.
+        ByValTemporaries[ArgIdx] = Temp;
+      }
+    }
+    if (!ByValCopyChains.empty())
+      ByValTempChain =
+          DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
+  }
+
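+  // Illustration only: on the GuaranteedTCO path above ("lower arguments at
+  // fp - stackoffset + fpdiff"), FPDiff is the caller's incoming argument
+  // bytes minus the callee's, and each byval destination is created at
+  // LocMemOffset + FPDiff. With hypothetical sizes:
+  //
+  //   caller pushed 16 bytes, callee needs 32 -> FPDiff = 16 - 32 = -16
+  //   LocMemOffset  0 -> fixed object at offset -16
+  //   LocMemOffset  8 -> fixed object at offset  -8
+  //   LocMemOffset 24 -> fixed object at offset   8
+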
 // If we have an inalloca argument, all stack space has already been allocated
 // for us and be right at the top of the stack. We don't support multiple
 // arguments passed in memory when using inalloca.
@@ -2186,7 +2301,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
-  SDValue StackPtr;
 
   // The next loop assumes that the locations are in the same order of the
   // input arguments.
@@ -2195,7 +2309,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Walk the register/memloc assignments, inserting copies/loads. In the case
   // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
        ++I, ++OutIndex) {
     assert(OutIndex < Outs.size() && "Invalid Out index");
@@ -2285,7 +2398,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       if (ShadowReg)
         RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
     }
-  } else if (!IsSibcall && (!isTailCall || isByVal)) {
+  } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) {
     assert(VA.isMemLoc());
     if (!StackPtr.getNode())
       StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
@@ -2372,6 +2485,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // would clobber.
     Chain = DAG.getStackArgumentTokenFactor(Chain);
 
+    if (ByValTempChain)
+      Chain =
+          DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain);
+
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
@@ -2404,16 +2521,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
 
       if (Flags.isByVal()) {
-        // Copy relative to framepointer.
-        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
-        if (!StackPtr.getNode())
-          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
-                                        getPointerTy(DAG.getDataLayout()));
-        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
-                             StackPtr, Source);
-
-        MemOpChains2.push_back(
-            CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl));
+        SDValue ByValSrc;
+        bool NeedsStackCopy;
+        if (auto It = ByValTemporaries.find(OutsIndex);
+            It != ByValTemporaries.end()) {
+          ByValSrc = It->second;
+          NeedsStackCopy = true;
+        } else {
+          ByValSrc = Arg;
+          NeedsStackCopy = false;
+        }
+
+        if (NeedsStackCopy) {
+          auto PtrVT = getPointerTy(DAG.getDataLayout());
+          SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT);
+
+          MemOpChains2.push_back(CreateCopyOfByValArgument(
+              ByValSrc, DstAddr, Chain, Flags, DAG, dl));
+        }
       } else {
         // Store relative to framepointer.
         MemOpChains2.push_back(DAG.getStore(
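The hazard that the CopyViaTemp path above protects against is easiest to see
at the source level. A minimal sketch in C, assuming a Clang-style frontend
([[clang::musttail]] is Clang's attribute; the swapByValArguments test below
is the IR equivalent):

  struct S { int v; };
  int swap(struct S a, struct S b);

  // The outgoing arguments reuse this function's own incoming stack slots:
  // storing b into slot 0 would clobber a, which is still needed for slot 1,
  // so at least one of the values must go through a local temporary first.
  int swapByValArguments(struct S a, struct S b) {
    [[clang::musttail]] return swap(b, a);
  }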
diff --git a/llvm/test/CodeGen/X86/musttail-struct.ll b/llvm/test/CodeGen/X86/musttail-struct.ll
new file mode 100644
index 0000000000000..735fd674a2ff1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/musttail-struct.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -x86-asm-syntax=intel | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-asm-syntax=intel | FileCheck %s --check-prefix=X64
+
+; Test correct handling of a musttail call with a byval struct argument.
+
+%struct.1xi32 = type { [1 x i32] }
+%struct.3xi32 = type { [3 x i32] }
+%struct.5xi32 = type { [5 x i32] }
+
+declare dso_local i32 @Func1(ptr byval(%struct.1xi32) %0)
+declare dso_local i32 @Func3(ptr byval(%struct.3xi32) %0)
+declare dso_local i32 @Func5(ptr byval(%struct.5xi32) %0)
+declare dso_local i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+
+define dso_local i32 @test1(ptr byval(%struct.1xi32) %0) {
+; X32-LABEL: test1:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func1 # TAILCALL
+;
+; X64-LABEL: test1:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func1 # TAILCALL
+  %r = musttail call i32 @Func1(ptr byval(%struct.1xi32) %0)
+  ret i32 %r
+}
+
+define dso_local i32 @test3(ptr byval(%struct.3xi32) %0) {
+; X32-LABEL: test3:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func3 # TAILCALL
+;
+; X64-LABEL: test3:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func3 # TAILCALL
+  %r = musttail call i32 @Func3(ptr byval(%struct.3xi32) %0)
+  ret i32 %r
+}
+
+; sizeof(%struct.5xi32) > 16, so on x64 this is passed on the stack.
+define dso_local i32 @test5(ptr byval(%struct.5xi32) %0) {
+; X32-LABEL: test5:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func5 # TAILCALL
+;
+; X64-LABEL: test5:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func5 # TAILCALL
+  %r = musttail call i32 @Func5(ptr byval(%struct.5xi32) %0)
+  ret i32 %r
+}
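+
+; Roughly the C shape of the forwarding tests above (illustrative; it assumes
+; the frontend lowers the struct parameter to a byval pointer as in this IR):
+;
+;   struct S5 { int a[5]; };
+;   int Func5(struct S5);
+;   int test5(struct S5 s) {
+;     // The incoming byval is already at the offset the callee expects, so
+;     // this is the NoCopy case and lowers to a bare jmp.
+;     [[clang::musttail]] return Func5(s);
+;   }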
+
+; Test passing multiple arguments with different sizes on the stack. On x64
+; Linux the first six integer arguments are passed in registers.
+define dso_local i32 @testManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) {
+; X32-LABEL: testManyArgs:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp FuncManyArgs # TAILCALL
+;
+; X64-LABEL: testManyArgs:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp FuncManyArgs # TAILCALL
+  %r = musttail call i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+  ret i32 %r
+}
+
+define dso_local i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) {
+; X32-LABEL: testRecursion:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp testRecursion # TAILCALL
+;
+; X64-LABEL: testRecursion:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp testRecursion # TAILCALL
+  %r = musttail call i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+  ret i32 %r
+}
+
+define dso_local i32 @swap(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) noinline {
+; X32-LABEL: swap:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    mov eax, dword ptr [esp + 4]
+; X32-NEXT:    add eax, dword ptr [esp + 8]
+; X32-NEXT:    ret
+;
+; X64-LABEL: swap:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov eax, dword ptr [rsp + 8]
+; X64-NEXT:    add eax, dword ptr [rsp + 16]
+; X64-NEXT:    ret
+entry:
+  %a.ptr = getelementptr inbounds %struct.1xi32, ptr %0, i32 0, i32 0, i32 0
+  %a = load i32, ptr %a.ptr, align 4
+  %b.ptr = getelementptr inbounds %struct.1xi32, ptr %1, i32 0, i32 0, i32 0
+  %b = load i32, ptr %b.ptr, align 4
+  %sum = add i32 %a, %b
+  ret i32 %sum
+}
+
+define dso_local i32 @swapByValArguments(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) {
+; X32-LABEL: swapByValArguments:
+; X32:       # %bb.0:
+; X32-NEXT:    sub esp, 8
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    mov eax, dword ptr [esp + 12]
+; X32-NEXT:    mov dword ptr [esp], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 16]
+; X32-NEXT:    mov dword ptr [esp + 4], eax
+; X32-NEXT:    add esp, 8
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp swap # TAILCALL
+;
+; X64-LABEL: swapByValArguments:
+; X64:       # %bb.0:
+; X64-NEXT:    mov eax, dword ptr [rsp + 8]
+; X64-NEXT:    mov dword ptr [rsp - 16], eax
+; X64-NEXT:    mov eax, dword ptr [rsp + 16]
+; X64-NEXT:    mov dword ptr [rsp - 8], eax
+; X64-NEXT:    jmp swap # TAILCALL
+  %r = musttail call i32 @swap(ptr byval(%struct.1xi32) %1, ptr byval(%struct.1xi32) %0)
+  ret i32 %r
+}
+
+; Clang only uses byval for arguments of 65 bytes or larger, but e.g. rustc
+; does use byval for smaller types. Here we use a 20-byte struct to keep
+; the tests more readable.
+%twenty_bytes = type { [5 x i32] }
+declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+
+; Functions with byval parameters can be tail-called, because the value is
+; passed on the stack in the same way for the caller and callee. On x86, byval
+; arguments are never (even partially) passed via registers.
+define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    jmp large_callee@PLT # TAILCALL
+;
+; X64-LABEL: large_caller:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp large_callee@PLT # TAILCALL
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; The IR for this one looks dodgy, because it passes an alloca to a musttail
+; function, but it is valid: the alloca is passed as a byval argument, so it
+; will be copied into the stack space allocated by @large_caller_new_value's
+; caller.
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller_new_value:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    sub esp, 20
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    mov dword ptr [esp], 0
+; X32-NEXT:    mov dword ptr [esp + 4], 1
+; X32-NEXT:    mov dword ptr [esp + 8], 2
+; X32-NEXT:    mov dword ptr [esp + 12], 3
+; X32-NEXT:    mov dword ptr [esp + 16], 4
+; X32-NEXT:    mov dword ptr [esp + 24], 0
+; X32-NEXT:    mov dword ptr [esp + 28], 1
+; X32-NEXT:    mov dword ptr [esp + 32], 2
+; X32-NEXT:    mov dword ptr [esp + 36], 3
+; X32-NEXT:    mov dword ptr [esp + 40], 4
+; X32-NEXT:    add esp, 20
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp large_callee@PLT # TAILCALL
+;
+; X64-LABEL: large_caller_new_value:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movabs rax, 4294967296
+; X64-NEXT:    mov qword ptr [rsp - 20], rax
+; X64-NEXT:    movabs rcx, 12884901890
+; X64-NEXT:    mov qword ptr [rsp - 12], rcx
+; X64-NEXT:    mov dword ptr [rsp - 4], 4
+; X64-NEXT:    mov qword ptr [rsp + 8], rax
+; X64-NEXT:    mov qword ptr [rsp + 16], rcx
+; X64-NEXT:    mov dword ptr [rsp + 24], 4
+; X64-NEXT:    jmp large_callee@PLT # TAILCALL
+entry:
+  %y = alloca %twenty_bytes, align 4
+  store i32 0, ptr %y, align 4
+  %0 = getelementptr inbounds i8, ptr %y, i32 4
+  store i32 1, ptr %0, align 4
+  %1 = getelementptr inbounds i8, ptr %y, i32 8
+  store i32 2, ptr %1, align 4
+  %2 = getelementptr inbounds i8, ptr %y, i32 12
+  store i32 3, ptr %2, align 4
+  %3 = getelementptr inbounds i8, ptr %y, i32 16
+  store i32 4, ptr %3, align 4
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
+  ret void
+}
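+
+; In C terms (illustrative): the source is a fresh local, so nothing written
+; to the outgoing argument area can clobber it, and a single copy into the
+; argument slots suffices (the CopyOnce case):
+;
+;   struct twenty_bytes { int a[5]; };
+;   void large_callee(struct twenty_bytes);
+;   void large_caller_new_value(struct twenty_bytes a) {
+;     struct twenty_bytes y = {{0, 1, 2, 3, 4}};
+;     [[clang::musttail]] return large_callee(y);
+;   }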
+
+declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4)
+define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; X32-LABEL: swap_byvals:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    sub esp, 40
+; X32-NEXT:    .cfi_def_cfa_offset 44
+; X32-NEXT:    mov eax, dword ptr [esp + 60]
+; X32-NEXT:    mov dword ptr [esp + 16], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 56]
+; X32-NEXT:    mov dword ptr [esp + 12], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 52]
+; X32-NEXT:    mov dword ptr [esp + 8], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 44]
+; X32-NEXT:    mov ecx, dword ptr [esp + 48]
+; X32-NEXT:    mov dword ptr [esp + 4], ecx
+; X32-NEXT:    mov dword ptr [esp], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 80]
+; X32-NEXT:    mov dword ptr [esp + 36], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 76]
+; X32-NEXT:    mov dword ptr [esp + 32], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 72]
+; X32-NEXT:    mov dword ptr [esp + 28], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 64]
+; X32-NEXT:    mov ecx, dword ptr [esp + 68]
+; X32-NEXT:    mov dword ptr [esp + 24], ecx
+; X32-NEXT:    mov dword ptr [esp + 20], eax
+; X32-NEXT:    add esp, 40
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp two_byvals_callee@PLT # TAILCALL
+;
+; X64-LABEL: swap_byvals:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov eax, dword ptr [rsp + 24]
+; X64-NEXT:    mov dword ptr [rsp - 8], eax
+; X64-NEXT:    movaps xmm0, xmmword ptr [rsp + 8]
+; X64-NEXT:    movaps xmmword ptr [rsp - 24], xmm0
+; X64-NEXT:    mov eax, dword ptr [rsp + 48]
+; X64-NEXT:    mov dword ptr [rsp - 32], eax
+; X64-NEXT:    mov rax, qword ptr [rsp + 32]
+; X64-NEXT:    mov rcx, qword ptr [rsp + 40]
+; X64-NEXT:    mov qword ptr [rsp - 40], rcx
+; X64-NEXT:    mov qword ptr [rsp - 48], rax
+; X64-NEXT:    jmp two_byvals_callee@PLT # TAILCALL
+entry:
+  musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; A forwarded byval arg, but at a different argument position. Because x86
+; never passes byval arguments (even partially) in registers, the leading i32
+; goes in a register on x86-64 and the byval is already at the correct stack
+; offset there, so the call can be tail-call optimized; on i686 the offsets
+; differ and a normal call is emitted.
+declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; X32-LABEL: shift_byval:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    push edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    push esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset esi, -12
+; X32-NEXT:    .cfi_offset edi, -8
+; X32-NEXT:    mov eax, dword ptr [esp + 32]
+; X32-NEXT:    mov ecx, dword ptr [esp + 28]
+; X32-NEXT:    mov edx, dword ptr [esp + 24]
+; X32-NEXT:    mov esi, dword ptr [esp + 16]
+; X32-NEXT:    mov edi, dword ptr [esp + 20]
+; X32-NEXT:    push eax
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push ecx
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push edx
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push edi
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push esi
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    call shift_byval_callee@PLT
+; X32-NEXT:    add esp, 20
+; X32-NEXT:    .cfi_adjust_cfa_offset -20
+; X32-NEXT:    pop esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pop edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    ret
+;
+; X64-LABEL: shift_byval:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp shift_byval_callee@PLT # TAILCALL
+entry:
+  tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b)
+  ret void
+}
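+
+; Unlike the musttail tests, @shift_byval uses a plain "tail call", so the
+; backend may fall back to a normal call where the byval slot does not line
+; up (as on i686 above). A musttail version could not even be written from C
+; here, since Clang's [[clang::musttail]] requires the caller and callee
+; prototypes to match (sketch, assuming Clang):
+;
+;   struct twenty_bytes { int a[5]; };
+;   void shift_byval_callee(struct twenty_bytes);
+;   void shift_byval(int a, struct twenty_bytes b) {
+;     return shift_byval_callee(b); // ordinary tail-call opportunity
+;   }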
+
+; A global object passed as a byval argument must be copied, but doesn't need
+; a stack temporary.
+@large_global = external global %twenty_bytes
+define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller_from_global:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    mov eax, dword ptr [large_global+16]
+; X32-NEXT:    mov dword ptr [esp + 20], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+12]
+; X32-NEXT:    mov dword ptr [esp + 16], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+8]
+; X32-NEXT:    mov dword ptr [esp + 12], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+4]
+; X32-NEXT:    mov dword ptr [esp + 8], eax
+; X32-NEXT:    mov eax, dword ptr [large_global]
+; X32-NEXT:    mov dword ptr [esp + 4], eax
+; X32-NEXT:    jmp large_callee@PLT # TAILCALL
+;
+; X64-LABEL: large_caller_from_global:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov rax, qword ptr [rip + large_global@GOTPCREL]
+; X64-NEXT:    mov ecx, dword ptr [rax + 16]
+; X64-NEXT:    mov dword ptr [rsp + 24], ecx
+; X64-NEXT:    mov rcx, qword ptr [rax]
+; X64-NEXT:    mov rax, qword ptr [rax + 8]
+; X64-NEXT:    mov qword ptr [rsp + 16], rax
+; X64-NEXT:    mov qword ptr [rsp + 8], rcx
+; X64-NEXT:    jmp large_callee@PLT # TAILCALL
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global)
+  ret void
+}
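+
+; In C terms (illustrative): a global source can never alias the outgoing
+; argument area, so it is classified CopyOnce and copied straight into the
+; argument slots, with no temporary:
+;
+;   struct twenty_bytes { int a[5]; };
+;   extern struct twenty_bytes large_global;
+;   void large_caller_from_global(struct twenty_bytes a) {
+;     [[clang::musttail]] return large_callee(large_global);
+;   }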
diff --git a/llvm/test/CodeGen/X86/musttail-tailcc.ll b/llvm/test/CodeGen/X86/musttail-tailcc.ll
index fae698d53b927..f1ffbcb1142c5 100644
--- a/llvm/test/CodeGen/X86/musttail-tailcc.ll
+++ b/llvm/test/CodeGen/X86/musttail-tailcc.ll
@@ -55,15 +55,6 @@ define dso_local tailcc void @void_test(i32, i32, i32, i32) {
 ;
 ; X86-LABEL: void_test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp void_test # TAILCALL
 entry:
   musttail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
@@ -77,15 +68,6 @@ define dso_local tailcc i1 @i1test(i32, i32, i32, i32) {
 ;
 ; X86-LABEL: i1test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp i1test # TAILCALL
 entry:
   %4 = musttail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index 2759a9883975e..d1137cac7d365 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -295,10 +295,15 @@ declare dso_local i32 @foo5(i32, i32, i32, i32, i32)
 define dso_local i32 @t12(i32 %x, i32 %y, ptr byval(%struct.t) align 4 %z) nounwind ssp {
 ; X86-LABEL: t12:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    jne foo6 # TAILCALL
-; X86-NEXT:  # %bb.1: # %bb2
+; X86-NEXT:    je .LBB12_1
+; X86-NEXT:  # %bb.2: # %bb
+; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    jmp foo6 # TAILCALL
+; X86-NEXT:  .LBB12_1: # %bb2
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $20, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t12:
diff --git a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
index cd669768705e5..b901d22f66392 100644
--- a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
+++ b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc %s -o - | FileCheck %s
 
 target triple = "x86_64-apple-macosx"
@@ -24,9 +25,7 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4,
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; CHECK-NEXT:    callq _foo
 ; CHECK-NEXT:    movq %r14, (%rax)
-; CHECK-NEXT:    movl [[OFF:[0-9]+]](%rsp), %edx
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movq %rcx, [[OFF]](%rsp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    movq %rax, %r14
 ; CHECK-NEXT:    movq %r13, %rdi
 ; CHECK-NEXT:    movq %r15, %rsi
@@ -34,7 +33,6 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4,
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r15
-; CHECK-NEXT:    addq $16, %rsp
 ; CHECK-NEXT:    jmp _tc_fn ## TAILCALL
 entry:
   %res = tail call ptr @foo()
diff --git a/llvm/test/CodeGen/X86/tailcallbyval64.ll b/llvm/test/CodeGen/X86/tailcallbyval64.ll
index 3d2f6392bd150..e44e156a7ad4d 100644
--- a/llvm/test/CodeGen/X86/tailcallbyval64.ll
+++ b/llvm/test/CodeGen/X86/tailcallbyval64.ll
@@ -5,8 +5,7 @@
 ; Expect the entry point.
 ; CHECK-LABEL: tailcaller:
 
-; Expect 2 rep;movs because of tail call byval lowering.
-; CHECK: rep;
+; Expect 1 rep;movs because of tail call stack argument lowering.
 ; CHECK: rep;
 
 ; A sequence of copyto/copyfrom virtual registers is used to deal with byval
diff --git a/llvm/test/CodeGen/X86/tailccbyval64.ll b/llvm/test/CodeGen/X86/tailccbyval64.ll
index c08a9a77bfb88..6b440058c84b5 100644
--- a/llvm/test/CodeGen/X86/tailccbyval64.ll
+++ b/llvm/test/CodeGen/X86/tailccbyval64.ll
@@ -5,8 +5,7 @@
 ; Expect the entry point.
 ; CHECK-LABEL: tailcaller:
 
-; Expect 2 rep;movs because of tail call byval lowering.
-; CHECK: rep;
+; Expect 1 rep;movs because of tail call stack argument lowering.
 ; CHECK: rep;
 
 ; A sequence of copyto/copyfrom virtual registers is used to deal with byval