diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..157642e14f68d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1067,6 +1067,19 @@ namespace llvm {
   //===--------------------------------------------------------------------===//
   //  X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
+    // Copying needed for an outgoing byval argument.
+    enum ByValCopyKind {
+      // Argument is already in the correct location, no copy needed.
+      NoCopy,
+      // Argument value is currently in the local stack frame, needs copying
+      // to the outgoing argument area.
+      CopyOnce,
+      // Argument value is currently in the outgoing argument area, but not at
+      // the correct offset, so needs copying via a temporary in local stack
+      // space.
+      CopyViaTemp,
+    };
+
   public:
     explicit X86TargetLowering(const X86TargetMachine &TM,
                                const X86Subtarget &STI);
@@ -1775,6 +1788,9 @@ namespace llvm {
     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
+                                            SDValue Dst,
+                                            ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 37d77728882b1..d1e8ad75a7d9a 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2018,6 +2018,49 @@ SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
 }
 
+// Returns the type of copying which is required to set up a byval argument to
+// a tail-called function. This isn't needed for non-tail calls, because they
+// always need the equivalent of CopyOnce, but tail calls sometimes need two
+// copies to avoid clobbering another argument (CopyViaTemp), and can
+// sometimes be optimised to zero copies when forwarding an argument from the
+// caller's caller (NoCopy).
+X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
+    SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Globals are always safe to copy from.
+  if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
+    return CopyOnce;
+
+  // We can only analyse frame index nodes; conservatively assume anything
+  // else needs a temporary.
+  auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
+  auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
+  if (!SrcFrameIdxNode || !DstFrameIdxNode)
+    return CopyViaTemp;
+
+  int SrcFI = SrcFrameIdxNode->getIndex();
+  int DstFI = DstFrameIdxNode->getIndex();
+  assert(MFI.isFixedObjectIndex(DstFI) &&
+         "byval passed in non-fixed stack slot");
+
+  int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
+  int64_t DstOffset = MFI.getObjectOffset(DstFI);
+
+  // If the source is in the local frame, then the copy to the argument
+  // memory is always valid.
+  bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
+  if (!FixedSrc || SrcOffset < 0)
+    return CopyOnce;
+
+  // If the value is already in the correct location, then no copying is
+  // needed. If not, then we need to copy via a temporary.
+  if (SrcOffset == DstOffset)
+    return NoCopy;
+  return CopyViaTemp;
+}
+
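+// Illustration only (not part of the lowering): the classification above,
+// restated with plain integers standing in for the MachineFrameInfo queries.
+//
+//   enum ByValCopyKind { NoCopy, CopyOnce, CopyViaTemp };
+//
+//   // FixedSrc: the source lives in the fixed (incoming-argument) area.
+//   ByValCopyKind classify(bool FixedSrc, int64_t SrcOffset,
+//                          int64_t DstOffset) {
+//     if (!FixedSrc || SrcOffset < 0) // local frame: stores to the outgoing
+//       return CopyOnce;              // argument area cannot clobber it
+//     return SrcOffset == DstOffset ? NoCopy : CopyViaTemp;
+//   }
+//
+//   classify(true, 8, 8)    == NoCopy      (forwarded incoming byval)
+//   classify(false, -24, 8) == CopyOnce    (local alloca)
+//   classify(true, 8, 24)   == CopyViaTemp (byval moving between slots)
+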
 SDValue
 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
@@ -2098,15 +2141,18 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     isTailCall = false;
   }
 
-  if (isTailCall && !IsMustTail) {
+  if (isTailCall) {
     // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
-                                                   IsCalleePopSRet);
+    IsSibcall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
+                                                  IsCalleePopSRet);
 
-    // Sibcalls are automatically detected tailcalls which do not require
-    // ABI changes.
-    if (!IsGuaranteeTCO && isTailCall)
-      IsSibcall = true;
+    if (!IsMustTail) {
+      isTailCall = IsSibcall;
+
+      // Sibcalls are automatically detected tailcalls which do not require
+      // ABI changes.
+      IsSibcall = IsSibcall && !IsGuaranteeTCO;
+    }
 
     if (isTailCall)
       ++NumTailCalls;
@@ -2128,8 +2174,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
+  // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
   int FPDiff = 0;
-  if (isTailCall &&
+  if (isTailCall && !IsSibcall &&
       shouldGuaranteeTCO(CallConv,
                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
     // Lower arguments at fp - stackoffset + fpdiff.
@@ -2146,6 +2193,74 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   unsigned NumBytesToPush = NumBytes;
   unsigned NumBytesToPop = NumBytes;
 
+  SDValue StackPtr;
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // If we are doing a tail-call, any byval arguments will be written to stack
+  // space which was used for incoming arguments. If any of the values being
+  // used are incoming byval arguments to this function, then they might be
+  // overwritten by the stores of the outgoing arguments. To avoid this, we
+  // need to make a temporary copy of them in local stack space, then copy
+  // back to the argument area.
+  DenseMap<unsigned, SDValue> ByValTemporaries;
+  SDValue ByValTempChain;
+  if (isTailCall) {
+    SmallVector<SDValue, 8> ByValCopyChains;
+    for (const CCValAssign &VA : ArgLocs) {
+      unsigned ArgIdx = VA.getValNo();
+      SDValue Src = OutVals[ArgIdx];
+      ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
+
+      if (!Flags.isByVal())
+        continue;
+
+      auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+      if (!StackPtr.getNode())
+        StackPtr =
+            DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), PtrVT);
+
+      // Destination: where this byval should live in the callee's frame
+      // after the tail call.
+      int32_t Offset = VA.getLocMemOffset() + FPDiff;
+      int Size = VA.getLocVT().getFixedSizeInBits() / 8;
+      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+      SDValue Dst = DAG.getFrameIndex(FI, PtrVT);
+
+      ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
+
+      if (Copy == NoCopy) {
+        // If the argument is already at the correct offset on the stack
+        // (because we are forwarding a byval argument from our caller), we
+        // don't need any copying.
+        continue;
+      } else if (Copy == CopyOnce) {
+        // If the argument is in our local stack frame, no other argument
+        // preparation can clobber it, so we can copy it to the final location
+        // later.
+        ByValTemporaries[ArgIdx] = Src;
+      } else {
+        assert(Copy == CopyViaTemp && "unexpected enum value");
+        // If we might be copying this argument from the outgoing argument
+        // stack area, we need to copy via a temporary in the local stack
+        // frame.
+        MachineFrameInfo &MFI = MF.getFrameInfo();
+        int TempFrameIdx = MFI.CreateStackObject(Flags.getByValSize(),
+                                                 Flags.getNonZeroByValAlign(),
+                                                 /*isSS=*/false);
+        SDValue Temp =
+            DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
+
+        SDValue CopyChain =
+            CreateCopyOfByValArgument(Src, Temp, Chain, Flags, DAG, dl);
+        ByValCopyChains.push_back(CopyChain);
+        // Record the temporary so the second pass copies it, rather than the
+        // original source, into the final argument slot.
+        ByValTemporaries[ArgIdx] = Temp;
+      }
+    }
+    if (!ByValCopyChains.empty())
+      ByValTempChain =
+          DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
+  }
+
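+  // Illustration only: on the GuaranteedTCO path above ("lower arguments at
+  // fp - stackoffset + fpdiff"), FPDiff is the caller's incoming argument
+  // bytes minus the callee's, and each byval destination is created at
+  // LocMemOffset + FPDiff. With hypothetical sizes:
+  //
+  //   caller pushed 16 bytes, callee needs 32 -> FPDiff = 16 - 32 = -16
+  //   LocMemOffset  0 -> fixed object at offset -16
+  //   LocMemOffset  8 -> fixed object at offset  -8
+  //   LocMemOffset 24 -> fixed object at offset   8
+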
 // If we have an inalloca argument, all stack space has already been allocated
 // for us and be right at the top of the stack. We don't support multiple
 // arguments passed in memory when using inalloca.
@@ -2186,7 +2301,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
-  SDValue StackPtr;
 
   // The next loop assumes that the locations are in the same order of the
   // input arguments.
@@ -2195,7 +2309,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Walk the register/memloc assignments, inserting copies/loads. In the case
   // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
        ++I, ++OutIndex) {
     assert(OutIndex < Outs.size() && "Invalid Out index");
@@ -2285,7 +2398,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       if (ShadowReg)
         RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
     }
-  } else if (!IsSibcall && (!isTailCall || isByVal)) {
+  } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) {
     assert(VA.isMemLoc());
     if (!StackPtr.getNode())
       StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
@@ -2372,6 +2485,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // would clobber.
     Chain = DAG.getStackArgumentTokenFactor(Chain);
 
+    if (ByValTempChain)
+      Chain =
+          DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain);
+
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
@@ -2404,16 +2521,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
 
       if (Flags.isByVal()) {
-        // Copy relative to framepointer.
-        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
-        if (!StackPtr.getNode())
-          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
-                                        getPointerTy(DAG.getDataLayout()));
-        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
-                             StackPtr, Source);
-
-        MemOpChains2.push_back(
-            CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl));
+        SDValue ByValSrc;
+        bool NeedsStackCopy;
+        if (auto It = ByValTemporaries.find(OutsIndex);
+            It != ByValTemporaries.end()) {
+          ByValSrc = It->second;
+          NeedsStackCopy = true;
+        } else {
+          ByValSrc = Arg;
+          NeedsStackCopy = false;
+        }
+
+        if (NeedsStackCopy) {
+          auto PtrVT = getPointerTy(DAG.getDataLayout());
+          SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT);
+
+          MemOpChains2.push_back(CreateCopyOfByValArgument(
+              ByValSrc, DstAddr, Chain, Flags, DAG, dl));
+        }
       } else {
         // Store relative to framepointer.
         MemOpChains2.push_back(DAG.getStore(
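The hazard that the CopyViaTemp path above protects against is easiest to see
at the source level. A minimal sketch in C, assuming a Clang-style frontend
([[clang::musttail]] is Clang's attribute; the swapByValArguments test below
is the IR equivalent):

  struct S { int v; };
  int swap(struct S a, struct S b);

  // The outgoing arguments reuse this function's own incoming stack slots:
  // storing b into slot 0 would clobber a, which is still needed for slot 1,
  // so at least one of the values must go through a local temporary first.
  int swapByValArguments(struct S a, struct S b) {
    [[clang::musttail]] return swap(b, a);
  }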
diff --git a/llvm/test/CodeGen/X86/musttail-struct.ll b/llvm/test/CodeGen/X86/musttail-struct.ll
new file mode 100644
index 0000000000000..735fd674a2ff1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/musttail-struct.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -x86-asm-syntax=intel | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-asm-syntax=intel | FileCheck %s --check-prefix=X64
+
+; Test correct handling of a musttail call with a byval struct argument.
+
+%struct.1xi32 = type { [1 x i32] }
+%struct.3xi32 = type { [3 x i32] }
+%struct.5xi32 = type { [5 x i32] }
+
+declare dso_local i32 @Func1(ptr byval(%struct.1xi32) %0)
+declare dso_local i32 @Func3(ptr byval(%struct.3xi32) %0)
+declare dso_local i32 @Func5(ptr byval(%struct.5xi32) %0)
+declare dso_local i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+
+define dso_local i32 @test1(ptr byval(%struct.1xi32) %0) {
+; X32-LABEL: test1:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func1 # TAILCALL
+;
+; X64-LABEL: test1:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func1 # TAILCALL
+  %r = musttail call i32 @Func1(ptr byval(%struct.1xi32) %0)
+  ret i32 %r
+}
+
+define dso_local i32 @test3(ptr byval(%struct.3xi32) %0) {
+; X32-LABEL: test3:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func3 # TAILCALL
+;
+; X64-LABEL: test3:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func3 # TAILCALL
+  %r = musttail call i32 @Func3(ptr byval(%struct.3xi32) %0)
+  ret i32 %r
+}
+
+; sizeof(%struct.5xi32) > 16, so on x64 this is passed on the stack.
+define dso_local i32 @test5(ptr byval(%struct.5xi32) %0) {
+; X32-LABEL: test5:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func5 # TAILCALL
+;
+; X64-LABEL: test5:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func5 # TAILCALL
+  %r = musttail call i32 @Func5(ptr byval(%struct.5xi32) %0)
+  ret i32 %r
+}
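+
+; Roughly the C shape of the forwarding tests above (illustrative; it assumes
+; the frontend lowers the struct parameter to a byval pointer as in this IR):
+;
+;   struct S5 { int a[5]; };
+;   int Func5(struct S5);
+;   int test5(struct S5 s) {
+;     // The incoming byval is already at the offset the callee expects, so
+;     // this is the NoCopy case and lowers to a bare jmp.
+;     [[clang::musttail]] return Func5(s);
+;   }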
+
+; Test passing multiple arguments with different sizes on the stack. On x64
+; Linux the first six integer arguments are passed in registers.
+define dso_local i32 @testManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) {
+; X32-LABEL: testManyArgs:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp FuncManyArgs # TAILCALL
+;
+; X64-LABEL: testManyArgs:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp FuncManyArgs # TAILCALL
+  %r = musttail call i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+  ret i32 %r
+}
+
+define dso_local i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) {
+; X32-LABEL: testRecursion:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp testRecursion # TAILCALL
+;
+; X64-LABEL: testRecursion:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp testRecursion # TAILCALL
+  %r = musttail call i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+  ret i32 %r
+}
+
+define dso_local i32 @swap(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) noinline {
+; X32-LABEL: swap:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    mov eax, dword ptr [esp + 4]
+; X32-NEXT:    add eax, dword ptr [esp + 8]
+; X32-NEXT:    ret
+;
+; X64-LABEL: swap:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov eax, dword ptr [rsp + 8]
+; X64-NEXT:    add eax, dword ptr [rsp + 16]
+; X64-NEXT:    ret
+entry:
+  %a.ptr = getelementptr inbounds %struct.1xi32, ptr %0, i32 0, i32 0, i32 0
+  %a = load i32, ptr %a.ptr, align 4
+  %b.ptr = getelementptr inbounds %struct.1xi32, ptr %1, i32 0, i32 0, i32 0
+  %b = load i32, ptr %b.ptr, align 4
+  %sum = add i32 %a, %b
+  ret i32 %sum
+}
+
+define dso_local i32 @swapByValArguments(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) {
+; X32-LABEL: swapByValArguments:
+; X32:       # %bb.0:
+; X32-NEXT:    sub esp, 8
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    mov eax, dword ptr [esp + 12]
+; X32-NEXT:    mov dword ptr [esp], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 16]
+; X32-NEXT:    mov dword ptr [esp + 4], eax
+; X32-NEXT:    add esp, 8
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp swap # TAILCALL
+;
+; X64-LABEL: swapByValArguments:
+; X64:       # %bb.0:
+; X64-NEXT:    mov eax, dword ptr [rsp + 8]
+; X64-NEXT:    mov dword ptr [rsp - 16], eax
+; X64-NEXT:    mov eax, dword ptr [rsp + 16]
+; X64-NEXT:    mov dword ptr [rsp - 8], eax
+; X64-NEXT:    jmp swap # TAILCALL
+  %r = musttail call i32 @swap(ptr byval(%struct.1xi32) %1, ptr byval(%struct.1xi32) %0)
+  ret i32 %r
+}
+
+; Clang only uses byval for arguments of 65 bytes or larger, but e.g. rustc
+; does use byval for smaller types. Here we use a 20-byte struct to keep
+; the tests more readable.
+%twenty_bytes = type { [5 x i32] }
+declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+
+; Functions with byval parameters can be tail-called, because the value is
+; passed on the stack in the same way for the caller and callee. On x86, byval
+; arguments are never (even partially) passed via registers.
+define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    jmp large_callee@PLT # TAILCALL
+;
+; X64-LABEL: large_caller:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp large_callee@PLT # TAILCALL
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; The IR for this one looks dodgy, because it passes an alloca to a musttail
+; function, but it is valid: the alloca is passed as a byval argument, so it
+; will be copied into the stack space allocated by @large_caller_new_value's
+; caller.
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller_new_value:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    sub esp, 20
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    mov dword ptr [esp], 0
+; X32-NEXT:    mov dword ptr [esp + 4], 1
+; X32-NEXT:    mov dword ptr [esp + 8], 2
+; X32-NEXT:    mov dword ptr [esp + 12], 3
+; X32-NEXT:    mov dword ptr [esp + 16], 4
+; X32-NEXT:    mov dword ptr [esp + 24], 0
+; X32-NEXT:    mov dword ptr [esp + 28], 1
+; X32-NEXT:    mov dword ptr [esp + 32], 2
+; X32-NEXT:    mov dword ptr [esp + 36], 3
+; X32-NEXT:    mov dword ptr [esp + 40], 4
+; X32-NEXT:    add esp, 20
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp large_callee@PLT # TAILCALL
+;
+; X64-LABEL: large_caller_new_value:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movabs rax, 4294967296
+; X64-NEXT:    mov qword ptr [rsp - 20], rax
+; X64-NEXT:    movabs rcx, 12884901890
+; X64-NEXT:    mov qword ptr [rsp - 12], rcx
+; X64-NEXT:    mov dword ptr [rsp - 4], 4
+; X64-NEXT:    mov qword ptr [rsp + 8], rax
+; X64-NEXT:    mov qword ptr [rsp + 16], rcx
+; X64-NEXT:    mov dword ptr [rsp + 24], 4
+; X64-NEXT:    jmp large_callee@PLT # TAILCALL
+entry:
+  %y = alloca %twenty_bytes, align 4
+  store i32 0, ptr %y, align 4
+  %0 = getelementptr inbounds i8, ptr %y, i32 4
+  store i32 1, ptr %0, align 4
+  %1 = getelementptr inbounds i8, ptr %y, i32 8
+  store i32 2, ptr %1, align 4
+  %2 = getelementptr inbounds i8, ptr %y, i32 12
+  store i32 3, ptr %2, align 4
+  %3 = getelementptr inbounds i8, ptr %y, i32 16
+  store i32 4, ptr %3, align 4
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
+  ret void
+}
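+
+; In C terms (illustrative): the source is a fresh local, so nothing written
+; to the outgoing argument area can clobber it, and a single copy into the
+; argument slots suffices (the CopyOnce case):
+;
+;   struct twenty_bytes { int a[5]; };
+;   void large_callee(struct twenty_bytes);
+;   void large_caller_new_value(struct twenty_bytes a) {
+;     struct twenty_bytes y = {{0, 1, 2, 3, 4}};
+;     [[clang::musttail]] return large_callee(y);
+;   }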
+
+declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4)
+define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; X32-LABEL: swap_byvals:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    sub esp, 40
+; X32-NEXT:    .cfi_def_cfa_offset 44
+; X32-NEXT:    mov eax, dword ptr [esp + 60]
+; X32-NEXT:    mov dword ptr [esp + 16], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 56]
+; X32-NEXT:    mov dword ptr [esp + 12], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 52]
+; X32-NEXT:    mov dword ptr [esp + 8], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 44]
+; X32-NEXT:    mov ecx, dword ptr [esp + 48]
+; X32-NEXT:    mov dword ptr [esp + 4], ecx
+; X32-NEXT:    mov dword ptr [esp], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 80]
+; X32-NEXT:    mov dword ptr [esp + 36], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 76]
+; X32-NEXT:    mov dword ptr [esp + 32], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 72]
+; X32-NEXT:    mov dword ptr [esp + 28], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 64]
+; X32-NEXT:    mov ecx, dword ptr [esp + 68]
+; X32-NEXT:    mov dword ptr [esp + 24], ecx
+; X32-NEXT:    mov dword ptr [esp + 20], eax
+; X32-NEXT:    add esp, 40
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp two_byvals_callee@PLT # TAILCALL
+;
+; X64-LABEL: swap_byvals:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov eax, dword ptr [rsp + 24]
+; X64-NEXT:    mov dword ptr [rsp - 8], eax
+; X64-NEXT:    movaps xmm0, xmmword ptr [rsp + 8]
+; X64-NEXT:    movaps xmmword ptr [rsp - 24], xmm0
+; X64-NEXT:    mov eax, dword ptr [rsp + 48]
+; X64-NEXT:    mov dword ptr [rsp - 32], eax
+; X64-NEXT:    mov rax, qword ptr [rsp + 32]
+; X64-NEXT:    mov rcx, qword ptr [rsp + 40]
+; X64-NEXT:    mov qword ptr [rsp - 40], rcx
+; X64-NEXT:    mov qword ptr [rsp - 48], rax
+; X64-NEXT:    jmp two_byvals_callee@PLT # TAILCALL
+entry:
+  musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; A forwarded byval arg, but at a different argument position. Because x86
+; never passes byval arguments (even partially) in registers, the leading i32
+; goes in a register on x86-64 and the byval is already at the correct stack
+; offset there, so the call can be tail-call optimized; on i686 the offsets
+; differ and a normal call is emitted.
+declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; X32-LABEL: shift_byval:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    push edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    push esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset esi, -12
+; X32-NEXT:    .cfi_offset edi, -8
+; X32-NEXT:    mov eax, dword ptr [esp + 32]
+; X32-NEXT:    mov ecx, dword ptr [esp + 28]
+; X32-NEXT:    mov edx, dword ptr [esp + 24]
+; X32-NEXT:    mov esi, dword ptr [esp + 16]
+; X32-NEXT:    mov edi, dword ptr [esp + 20]
+; X32-NEXT:    push eax
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push ecx
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push edx
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push edi
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push esi
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    call shift_byval_callee@PLT
+; X32-NEXT:    add esp, 20
+; X32-NEXT:    .cfi_adjust_cfa_offset -20
+; X32-NEXT:    pop esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pop edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    ret
+;
+; X64-LABEL: shift_byval:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp shift_byval_callee@PLT # TAILCALL
+entry:
+  tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b)
+  ret void
+}
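+
+; Unlike the musttail tests, @shift_byval uses a plain "tail call", so the
+; backend may fall back to a normal call where the byval slot does not line
+; up (as on i686 above). A musttail version could not even be written from C
+; here, since Clang's [[clang::musttail]] requires the caller and callee
+; prototypes to match (sketch, assuming Clang):
+;
+;   struct twenty_bytes { int a[5]; };
+;   void shift_byval_callee(struct twenty_bytes);
+;   void shift_byval(int a, struct twenty_bytes b) {
+;     return shift_byval_callee(b); // ordinary tail-call opportunity
+;   }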
+
+; A global object passed as a byval argument must be copied, but doesn't need
+; a stack temporary.
+@large_global = external global %twenty_bytes
+define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller_from_global:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    mov eax, dword ptr [large_global+16]
+; X32-NEXT:    mov dword ptr [esp + 20], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+12]
+; X32-NEXT:    mov dword ptr [esp + 16], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+8]
+; X32-NEXT:    mov dword ptr [esp + 12], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+4]
+; X32-NEXT:    mov dword ptr [esp + 8], eax
+; X32-NEXT:    mov eax, dword ptr [large_global]
+; X32-NEXT:    mov dword ptr [esp + 4], eax
+; X32-NEXT:    jmp large_callee@PLT # TAILCALL
+;
+; X64-LABEL: large_caller_from_global:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov rax, qword ptr [rip + large_global@GOTPCREL]
+; X64-NEXT:    mov ecx, dword ptr [rax + 16]
+; X64-NEXT:    mov dword ptr [rsp + 24], ecx
+; X64-NEXT:    mov rcx, qword ptr [rax]
+; X64-NEXT:    mov rax, qword ptr [rax + 8]
+; X64-NEXT:    mov qword ptr [rsp + 16], rax
+; X64-NEXT:    mov qword ptr [rsp + 8], rcx
+; X64-NEXT:    jmp large_callee@PLT # TAILCALL
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global)
+  ret void
+}
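+
+; In C terms (illustrative): a global source can never alias the outgoing
+; argument area, so it is classified CopyOnce and copied straight into the
+; argument slots, with no temporary:
+;
+;   struct twenty_bytes { int a[5]; };
+;   extern struct twenty_bytes large_global;
+;   void large_caller_from_global(struct twenty_bytes a) {
+;     [[clang::musttail]] return large_callee(large_global);
+;   }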
diff --git a/llvm/test/CodeGen/X86/musttail-tailcc.ll b/llvm/test/CodeGen/X86/musttail-tailcc.ll
index fae698d53b927..f1ffbcb1142c5 100644
--- a/llvm/test/CodeGen/X86/musttail-tailcc.ll
+++ b/llvm/test/CodeGen/X86/musttail-tailcc.ll
@@ -55,15 +55,6 @@ define dso_local tailcc void @void_test(i32, i32, i32, i32) {
 ;
 ; X86-LABEL: void_test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp void_test # TAILCALL
 entry:
   musttail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
@@ -77,15 +68,6 @@ define dso_local tailcc i1 @i1test(i32, i32, i32, i32) {
 ;
 ; X86-LABEL: i1test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp i1test # TAILCALL
 entry:
   %4 = musttail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index 2759a9883975e..d1137cac7d365 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -295,10 +295,15 @@ declare dso_local i32 @foo5(i32, i32, i32, i32, i32)
 define dso_local i32 @t12(i32 %x, i32 %y, ptr byval(%struct.t) align 4 %z) nounwind ssp {
 ; X86-LABEL: t12:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    jne foo6 # TAILCALL
-; X86-NEXT:  # %bb.1: # %bb2
+; X86-NEXT:    je .LBB12_1
+; X86-NEXT:  # %bb.2: # %bb
+; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    jmp foo6 # TAILCALL
+; X86-NEXT:  .LBB12_1: # %bb2
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $20, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t12:
diff --git a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
index cd669768705e5..b901d22f66392 100644
--- a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
+++ b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc %s -o - | FileCheck %s
 
 target triple = "x86_64-apple-macosx"
@@ -24,9 +25,7 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4,
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; CHECK-NEXT:    callq _foo
 ; CHECK-NEXT:    movq %r14, (%rax)
-; CHECK-NEXT:    movl [[OFF:[0-9]+]](%rsp), %edx
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movq %rcx, [[OFF]](%rsp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    movq %rax, %r14
 ; CHECK-NEXT:    movq %r13, %rdi
 ; CHECK-NEXT:    movq %r15, %rsi
@@ -34,7 +33,6 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4,
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r15
-; CHECK-NEXT:    addq $16, %rsp
 ; CHECK-NEXT:    jmp _tc_fn ## TAILCALL
 entry:
   %res = tail call ptr @foo()
diff --git a/llvm/test/CodeGen/X86/tailcallbyval64.ll b/llvm/test/CodeGen/X86/tailcallbyval64.ll
index 3d2f6392bd150..e44e156a7ad4d 100644
--- a/llvm/test/CodeGen/X86/tailcallbyval64.ll
+++ b/llvm/test/CodeGen/X86/tailcallbyval64.ll
@@ -5,8 +5,7 @@
 ; Expect the entry point.
 ; CHECK-LABEL: tailcaller:
 
-; Expect 2 rep;movs because of tail call byval lowering.
-; CHECK: rep;
+; Expect 1 rep;movs because of tail call stack argument lowering.
 ; CHECK: rep;
 
 ; A sequence of copyto/copyfrom virtual registers is used to deal with byval
diff --git a/llvm/test/CodeGen/X86/tailccbyval64.ll b/llvm/test/CodeGen/X86/tailccbyval64.ll
index c08a9a77bfb88..6b440058c84b5 100644
--- a/llvm/test/CodeGen/X86/tailccbyval64.ll
+++ b/llvm/test/CodeGen/X86/tailccbyval64.ll
@@ -5,8 +5,7 @@
 ; Expect the entry point.
 ; CHECK-LABEL: tailcaller:
 
-; Expect 2 rep;movs because of tail call byval lowering.
-; CHECK: rep;
+; Expect 1 rep;movs because of tail call stack argument lowering.
 ; CHECK: rep;
 
 ; A sequence of copyto/copyfrom virtual registers is used to deal with byval