[SLP] Fix insertion point setting for the nodes

alexey-bataev · alexey-bataev · commit 2c3aa9208969 · 2025-11-19T17:15:24.000-08:00
Many of the def-use chain problems in the SLP vectorizer are related to the fact that some nodes reuse the same instruction as their insertion point. An insertion point is not an instruction, but the place between instructions. To set it correctly, it is better to generate a pseudo instruction immediately after the last instruction and use it as the insertion point. This resolves the issues in most cases. Fixes #168512 #168576
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2082,6 +2082,7 @@ class slpvectorizer::BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
+    LastInstructionToPos.clear();
     LoadEntriesToVectorize.clear();
     IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst.reset();
@@ -4593,6 +4594,10 @@ class slpvectorizer::BoUpSLP {
   /// pre-gather them before.
   SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
 
+  /// Keeps the mapping between the last instructions and their insertion
+  /// points, each of which is an instruction-after-the-last-instruction.
+  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
+
   /// List of gather nodes, depending on other gather/vector nodes, which should
   /// be emitted after the vector instruction emission process to correctly
   /// handle order of the vector instructions and shuffles.
@@ -17894,6 +17899,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
     Builder.SetInsertPoint(
         LastInst->getParent(),
         LastInst->getNextNode()->getIterator());
+    if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
+      Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
+    } else {
+      Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
+                                      PoisonValue::get(Builder.getPtrTy()),
+                                      MaybeAlign());
+      Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
+      eraseInstruction(Res);
+      LastInstructionToPos.try_emplace(LastInst, Res);
+    }
   }
   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
 }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll
@@ -10,11 +10,11 @@ define double @test() {
 ; CHECK-NEXT:    br label %[[BB4]]
 ; CHECK:       [[BB4]]:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i32 0, 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 1>, i32 [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[MUL]], i32 0
 ; CHECK-NEXT:    [[TMP3]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 1>, i32 [[MUL]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP6]], 0
 ; CHECK-NEXT:    br i1 false, label %[[BB7:.*]], label %[[BB1]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test() {
+; CHECK-LABEL: define i32 @test() {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], %[[BB24:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], %[[BB24]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
+; CHECK-NEXT:    br i1 false, label %[[BB4:.*]], label %[[BB11:.*]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ zeroinitializer, %[[BB1]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i32> [ [[TMP0]], %[[BB1]] ]
+; CHECK-NEXT:    br label %[[BB19:.*]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    br i1 false, label %[[BB12:.*]], label %[[BB16:.*]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP1]], <i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    br label %[[BB13:.*]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <4 x i32> [ [[TMP4]], %[[BB12]] ]
+; CHECK-NEXT:    br label %[[BB16]]
+; CHECK:       [[BB16]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB11]] ], [ [[TMP5]], %[[BB13]] ]
+; CHECK-NEXT:    br label %[[BB19]]
+; CHECK:       [[BB19]]:
+; CHECK-NEXT:    [[PHI22:%.*]] = phi double [ 0.000000e+00, %[[BB4]] ], [ 0.000000e+00, %[[BB16]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB4]] ], [ [[TMP6]], %[[BB16]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP7]], <i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    br label %[[BB24]]
+; CHECK:       [[BB24]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], <i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i32> [[TMP9]], <i32 poison, i32 poison, i32 -1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP13]] = lshr <4 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 3>
+; CHECK-NEXT:    br label %[[BB1]]
+;
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i32 [ %lshr25, %bb24 ], [ 0, %bb ]
+  %phi2 = phi i32 [ %or26, %bb24 ], [ 0, %bb ]
+  %phi3 = phi i32 [ 0, %bb24 ], [ 0, %bb ]
+  br i1 false, label %bb4, label %bb11
+
+bb4:
+  %phi6 = phi i32 [ poison, %bb1 ]
+  %phi7 = phi i32 [ poison, %bb1 ]
+  %phi9 = phi i32 [ %phi2, %bb1 ]
+  %phi10 = phi i32 [ %phi, %bb1 ]
+  %0 = phi <2 x double> [ zeroinitializer, %bb1 ]
+  br label %bb19
+
+bb11:
+  br i1 false, label %bb12, label %bb16
+
+bb12:
+  %or = or i32 0, %phi3
+  br label %bb13
+
+bb13:
+  %phi14 = phi i32 [ %phi, %bb12 ]
+  %phi15 = phi i32 [ %or, %bb12 ]
+  br label %bb16
+
+bb16:
+  %phi17 = phi i32 [ 0, %bb11 ], [ %phi14, %bb13 ]
+  %phi18 = phi i32 [ 0, %bb11 ], [ %phi15, %bb13 ]
+  br label %bb19
+
+bb19:
+  %phi20 = phi i32 [ 0, %bb4 ], [ %phi17, %bb16 ]
+  %phi21 = phi i32 [ 0, %bb4 ], [ %phi18, %bb16 ]
+  %phi22 = phi double [ 0.000000e+00, %bb4 ], [ 0.000000e+00, %bb16 ]
+  %or23 = or i32 %phi21, 0
+  br label %bb24
+
+bb24:
+  %lshr = lshr i32 %phi20, 0
+  %and = and i32 %lshr, 0
+  %lshr25 = lshr i32 %phi, %and
+  %or26 = or i32 0, %or23
+  br label %bb1
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define float @test() {
+; CHECK-LABEL: define float @test() {
+; CHECK-NEXT:  [[LABEL:.*]]:
+; CHECK-NEXT:    [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef>, <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    br label %[[REGION_30:.*]]
+; CHECK:       [[REGION_30]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
+; CHECK-NEXT:    ret float [[TMP27]]
+;
+label:
+  %tmp.0.4.vec.extract = extractelement <4 x float> zeroinitializer, i64 0
+  %tmp.0.0.vec.extract = extractelement <4 x float> zeroinitializer, i64 0
+  %tmp.12.36.vec.extract = extractelement <4 x float> zeroinitializer, i64 0
+  %tmp.7.28.vec.extract = extractelement <4 x float> zeroinitializer, i64 0
+  %tmp.0.12.vec.extract = extractelement <4 x float> zeroinitializer, i64 0
+  %mul3.i128.i = fmul float 0.000000e+00, 0.000000e+00
+  %mul3.i138.i = fmul float 0.000000e+00, 0.000000e+00
+  %sub.i102.i = fsub float 0.000000e+00, 0.000000e+00
+  %mul.i136.i = fmul float %sub.i102.i, 0.000000e+00
+  %v.0.4.vec.extract = extractelement <8 x float> zeroinitializer, i64 0
+  %v.0.24.vec.extract = extractelement <8 x float> zeroinitializer, i64 0
+  %v.0.28.vec.extract = extractelement <8 x float> zeroinitializer, i64 0
+  %v.11.48.vec.extract = extractelement <8 x float> zeroinitializer, i64 0
+  %v.20.72.vec.extract = extractelement <8 x float> zeroinitializer, i64 0
+  %v.20.76.vec.extract = extractelement <8 x float> zeroinitializer, i64 0
+  %0 = fmul float %v.0.4.vec.extract, %tmp.0.0.vec.extract
+  %1 = fadd float %0, 0.000000e+00
+  %2 = fmul float %v.0.28.vec.extract, 0.000000e+00
+  %3 = fmul float %v.0.28.vec.extract, %tmp.0.0.vec.extract
+  %4 = fadd float %3, 0.000000e+00
+  %5 = fmul float %tmp.0.4.vec.extract, %v.11.48.vec.extract
+  %6 = fadd float 0.000000e+00, %5
+  %7 = fmul float %v.20.76.vec.extract, %tmp.0.0.vec.extract
+  %8 = fadd float %7, 0.000000e+00
+  %9 = fmul float 0.000000e+00, %tmp.0.12.vec.extract
+  %10 = fadd float %2, %9
+  %11 = fadd float %10, 0.000000e+00
+  %12 = fsub float 0.000000e+00, %11
+  %13 = fadd float 0.000000e+00, %1
+  %14 = fadd float 0.000000e+00, %4
+  %15 = fadd float 0.000000e+00, %6
+  %16 = fadd float 0.000000e+00, %8
+  %17 = fmul float 0.000000e+00, 0.000000e+00
+  %18 = fmul float 0.000000e+00, %tmp.7.28.vec.extract
+  %19 = fmul float 0.000000e+00, 0.000000e+00
+  %20 = fmul float 0.000000e+00, 0.000000e+00
+  %21 = fmul float 0.000000e+00, %tmp.7.28.vec.extract
+  %22 = fmul float 0.000000e+00, 0.000000e+00
+  %23 = fmul float 0.000000e+00, %tmp.12.36.vec.extract
+  %24 = fadd float %18, %23
+  %25 = fmul float 0.000000e+00, %tmp.12.36.vec.extract
+  %26 = fadd float %21, %25
+  %27 = fsub float 0.000000e+00, poison
+  %28 = fadd float %24, 0.000000e+00
+  %29 = fsub float 0.000000e+00, %28
+  %30 = fadd float %26, 0.000000e+00
+  %31 = fsub float 0.000000e+00, %30
+  %32 = fadd float 0.000000e+00, %17
+  %33 = fadd float 0.000000e+00, %19
+  %34 = fadd float 0.000000e+00, %20
+  %35 = fadd float 0.000000e+00, %22
+  %36 = fmul float 0.000000e+00, %mul3.i138.i
+  %37 = fmul float %v.0.4.vec.extract, %mul.i136.i
+  %38 = fadd float %37, 0.000000e+00
+  %39 = fmul float 0.000000e+00, %mul3.i138.i
+  %40 = fmul float %mul3.i138.i, %v.0.24.vec.extract
+  %41 = fadd float 0.000000e+00, %40
+  %42 = fmul float 0.000000e+00, %mul3.i138.i
+  %43 = fmul float 0.000000e+00, %mul3.i138.i
+  %44 = fmul float %mul3.i138.i, %v.20.72.vec.extract
+  %45 = fadd float 0.000000e+00, %44
+  %46 = fmul float 0.000000e+00, 1.000000e+00
+  %47 = fmul float 0.000000e+00, %mul3.i128.i
+  %48 = fadd float %36, %47
+  %49 = fmul float 0.000000e+00, %mul3.i128.i
+  %50 = fadd float %39, %49
+  %51 = fmul float 0.000000e+00, %mul3.i128.i
+  %52 = fadd float %42, %51
+  %53 = fmul float 0.000000e+00, %mul3.i128.i
+  %54 = fadd float %43, %53
+  %55 = fadd float %46, 0.000000e+00
+  %56 = fadd float %48, 0.000000e+00
+  %57 = fsub float %55, %56
+  %58 = fadd float %50, 0.000000e+00
+  %59 = fsub float 0.000000e+00, %58
+  %60 = fadd float %52, 0.000000e+00
+  %61 = fsub float 0.000000e+00, %60
+  %62 = fadd float %mul.i136.i, 0.000000e+00
+  %63 = fadd float %54, 0.000000e+00
+  %64 = fsub float %62, %63
+  %65 = fadd float 0.000000e+00, %38
+  %66 = fadd float 0.000000e+00, %41
+  %67 = fadd float 0.000000e+00, %45
+  br label %region.30
+
+region.30:
+  %68 = phi float [ %29, %label ]
+  %69 = phi float [ %66, %label ]
+  %70 = phi float [ %59, %label ]
+  %71 = phi float [ %15, %label ]
+  %72 = phi float [ %34, %label ]
+  %73 = phi float [ poison, %label ]
+  %74 = phi float [ %61, %label ]
+  %75 = phi float [ %16, %label ]
+  %76 = phi float [ %35, %label ]
+  %77 = phi float [ %31, %label ]
+  %78 = phi float [ %67, %label ]
+  %79 = phi float [ %64, %label ]
+  %80 = phi float [ %33, %label ]
+  %81 = phi float [ %12, %label ]
+  %82 = phi float [ %14, %label ]
+  %83 = phi float [ %57, %label ]
+  %84 = phi float [ %65, %label ]
+  %85 = phi float [ %27, %label ]
+  %86 = phi float [ %32, %label ]
+  %87 = phi float [ %13, %label ]
+  ret float %87
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
@@ -5,11 +5,11 @@ define i1 @test() {
 ; CHECK-LABEL: define i1 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[H_PROMOTED118_I_FR]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[H_PROMOTED118_I_FR]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> zeroinitializer, [[TMP3]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[H_PROMOTED118_I_FR]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]], <i32 1, i32 0, i32 0, i32 0>