llvm · lukel97 · Sep 26, 2025 · Sep 23, 2025 · Sep 24, 2025 · Sep 25, 2025
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3051,7 +3051,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.VF.isVector() && shouldPack()) {
     Value *WideValue =
         State.Lane->isFirstLane()
-            ? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
+            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
             : State.get(this);
     State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                         *State.Lane));
@@ -3267,11 +3267,21 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   // also do that packing, thereby "hoisting" the insert-element sequence.
   // Otherwise, a phi node for the scalar value is needed.
   if (State.hasVectorValue(getOperand(0))) {
-    Value *VectorValue = State.get(getOperand(0));
-    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
-    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
-    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
-    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+    auto *VecI = cast<Instruction>(State.get(getOperand(0)));
+    assert(isa<InsertElementInst>(VecI) || isa<InsertValueInst>(VecI));
+
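+    // If VecI is a struct, it ends a chain of insertvalues, one per field:
+    //   %1 = insertvalue %unmodified, %x, 0 ... %VecI = insertvalue %2, %z, 2
+    // so walk operand 0 back to the first one to find the unmodified value.
+    // NOTE(review): the walk overwrites VecI, so the predicated incoming of
+    // the phi below is %1, not the fully packed value -- TODO confirm.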
+    if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
+      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
+        VecI = cast<Instruction>(VecI->getOperand(0));
+
+    PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
+    VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
+    VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
     if (State.hasVectorValue(this))
       State.reset(this, VPhi);
     else

diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -166,7 +166,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
 ; VF4-NEXT:    store <4 x float> [[TMP42]], ptr [[TMP45]], align 4
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
 ;
 ; VF2IC2-LABEL: define void @struct_return_2xf32_replicate(
@@ -233,7 +233,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
 ; VF2IC2-NEXT:    store <2 x float> [[TMP44]], ptr [[TMP50]], align 4
 ; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2IC2-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VF2IC2-NEXT:    br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2IC2-NEXT:    br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF2IC2:       [[MIDDLE_BLOCK]]:
 ;
 entry:
@@ -336,7 +336,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
 ; VF4-NEXT:    store <4 x i32> [[TMP63]], ptr [[TMP64]], align 4
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VF4-NEXT:    br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4-NEXT:    br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
 ;
 ; VF2IC2-LABEL: define void @struct_return_3xi32_replicate(
@@ -425,7 +425,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
 ; VF2IC2-NEXT:    store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4
 ; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2IC2-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VF2IC2-NEXT:    br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2IC2-NEXT:    br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2IC2:       [[MIDDLE_BLOCK]]:
 ;
 entry:
@@ -453,6 +453,224 @@ exit:
   ret void
 }
 
+define void @struct_return_2xf32_replicate_predicated(ptr %a) {
+; @fn2 returns a struct and is only called when %in_val > 0.0, so the call is
+; replicated per lane under a predicate. The VF4 checks cover packing the
+; scalar struct results into { <4 x float>, <4 x float> } via per-lane phis.
+; VF4-LABEL: define void @struct_return_2xf32_replicate_predicated(
+; VF4-SAME: ptr [[A:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 8
+; VF4-NEXT:    [[TMP1:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; VF4-NEXT:    br i1 [[TMP2]], label %[[PRED_CALL_IF:.*]], label %[[PRED_CALL_CONTINUE:.*]]
+; VF4:       [[PRED_CALL_IF]]:
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR3:[0-9]+]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractvalue { float, float } [[TMP4]], 0
+; VF4-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0
+; VF4-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP6]], 0
+; VF4-NEXT:    [[TMP8:%.*]] = extractvalue { float, float } [[TMP4]], 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP7]], 1
+; VF4-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP8]], i32 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP7]], <4 x float> [[TMP10]], 1
+; VF4-NEXT:    br label %[[PRED_CALL_CONTINUE]]
+; VF4:       [[PRED_CALL_CONTINUE]]:
+; VF4-NEXT:    [[TMP12:%.*]] = phi { <4 x float>, <4 x float> } [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_CALL_IF]] ]
+; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; VF4-NEXT:    br i1 [[TMP13]], label %[[PRED_CALL_IF1:.*]], label %[[PRED_CALL_CONTINUE2:.*]]
+; VF4:       [[PRED_CALL_IF1]]:
+; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP15:%.*]] = tail call { float, float } @fn2(float [[TMP14]]) #[[ATTR3]]
+; VF4-NEXT:    [[TMP16:%.*]] = extractvalue { float, float } [[TMP15]], 0
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 0
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP16]], i32 1
+; VF4-NEXT:    [[TMP19:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP18]], 0
+; VF4-NEXT:    [[TMP20:%.*]] = extractvalue { float, float } [[TMP15]], 1
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP19]], 1
+; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP20]], i32 1
+; VF4-NEXT:    [[TMP23:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP19]], <4 x float> [[TMP22]], 1
+; VF4-NEXT:    br label %[[PRED_CALL_CONTINUE2]]
+; VF4:       [[PRED_CALL_CONTINUE2]]:
+; VF4-NEXT:    [[TMP24:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP12]], %[[PRED_CALL_CONTINUE]] ], [ [[TMP19]], %[[PRED_CALL_IF1]] ]
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; VF4-NEXT:    br i1 [[TMP25]], label %[[PRED_CALL_IF3:.*]], label %[[PRED_CALL_CONTINUE4:.*]]
+; VF4:       [[PRED_CALL_IF3]]:
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = tail call { float, float } @fn2(float [[TMP26]]) #[[ATTR3]]
+; VF4-NEXT:    [[TMP28:%.*]] = extractvalue { float, float } [[TMP27]], 0
+; VF4-NEXT:    [[TMP29:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0
+; VF4-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP28]], i32 2
+; VF4-NEXT:    [[TMP31:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP30]], 0
+; VF4-NEXT:    [[TMP32:%.*]] = extractvalue { float, float } [[TMP27]], 1
+; VF4-NEXT:    [[TMP33:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP31]], 1
+; VF4-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP32]], i32 2
+; VF4-NEXT:    [[TMP35:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP31]], <4 x float> [[TMP34]], 1
+; VF4-NEXT:    br label %[[PRED_CALL_CONTINUE4]]
+; VF4:       [[PRED_CALL_CONTINUE4]]:
+; VF4-NEXT:    [[TMP36:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP24]], %[[PRED_CALL_CONTINUE2]] ], [ [[TMP31]], %[[PRED_CALL_IF3]] ]
+; VF4-NEXT:    [[TMP37:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; VF4-NEXT:    br i1 [[TMP37]], label %[[PRED_CALL_IF5:.*]], label %[[PRED_CALL_CONTINUE6:.*]]
+; VF4:       [[PRED_CALL_IF5]]:
+; VF4-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP39:%.*]] = tail call { float, float } @fn2(float [[TMP38]]) #[[ATTR3]]
+; VF4-NEXT:    [[TMP40:%.*]] = extractvalue { float, float } [[TMP39]], 0
+; VF4-NEXT:    [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 0
+; VF4-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP40]], i32 3
+; VF4-NEXT:    [[TMP43:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP42]], 0
+; VF4-NEXT:    [[TMP44:%.*]] = extractvalue { float, float } [[TMP39]], 1
+; VF4-NEXT:    [[TMP45:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP43]], 1
+; VF4-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP44]], i32 3
+; VF4-NEXT:    [[TMP47:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP43]], <4 x float> [[TMP46]], 1
+; VF4-NEXT:    br label %[[PRED_CALL_CONTINUE6]]
+; VF4:       [[PRED_CALL_CONTINUE6]]:
+; VF4-NEXT:    [[TMP48:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP36]], %[[PRED_CALL_CONTINUE4]] ], [ [[TMP43]], %[[PRED_CALL_IF5]] ]
+; VF4-NEXT:    [[TMP49:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP48]], 0
+; VF4-NEXT:    [[TMP50:%.*]] = fdiv <4 x float> [[TMP49]], [[WIDE_LOAD]]
+; VF4-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; VF4-NEXT:    br i1 [[TMP51]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF4:       [[PRED_STORE_IF]]:
+; VF4-NEXT:    [[TMP52:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP52]]
+; VF4-NEXT:    [[TMP54:%.*]] = extractelement <4 x float> [[TMP50]], i32 0
+; VF4-NEXT:    store float [[TMP54]], ptr [[TMP53]], align 8
+; VF4-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF4:       [[PRED_STORE_CONTINUE]]:
+; VF4-NEXT:    [[TMP55:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; VF4-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF4:       [[PRED_STORE_IF7]]:
+; VF4-NEXT:    [[TMP56:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP56]]
+; VF4-NEXT:    [[TMP58:%.*]] = extractelement <4 x float> [[TMP50]], i32 1
+; VF4-NEXT:    store float [[TMP58]], ptr [[TMP57]], align 8
+; VF4-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF4:       [[PRED_STORE_CONTINUE8]]:
+; VF4-NEXT:    [[TMP59:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; VF4-NEXT:    br i1 [[TMP59]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF4:       [[PRED_STORE_IF9]]:
+; VF4-NEXT:    [[TMP60:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP60]]
+; VF4-NEXT:    [[TMP62:%.*]] = extractelement <4 x float> [[TMP50]], i32 2
+; VF4-NEXT:    store float [[TMP62]], ptr [[TMP61]], align 8
+; VF4-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF4:       [[PRED_STORE_CONTINUE10]]:
+; VF4-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; VF4-NEXT:    br i1 [[TMP63]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; VF4:       [[PRED_STORE_IF11]]:
+; VF4-NEXT:    [[TMP64:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP64]]
+; VF4-NEXT:    [[TMP66:%.*]] = extractelement <4 x float> [[TMP50]], i32 3
+; VF4-NEXT:    store float [[TMP66]], ptr [[TMP65]], align 8
+; VF4-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF4:       [[PRED_STORE_CONTINUE12]]:
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_2xf32_replicate_predicated(
+; VF2IC2-SAME: ptr [[A:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 8
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP1]], align 8
+; VF2IC2-NEXT:    [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], zeroinitializer
+; VF2IC2-NEXT:    [[TMP3:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], zeroinitializer
+; VF2IC2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; VF2IC2-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2:       [[PRED_STORE_IF]]:
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR3:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP11:%.*]] = fdiv float [[TMP8]], [[TMP10]]
+; VF2IC2-NEXT:    store float [[TMP11]], ptr [[TMP9]], align 8
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2:       [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; VF2IC2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]]
+; VF2IC2:       [[PRED_STORE_IF2]]:
+; VF2IC2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP14:%.*]] = tail call { float, float } @fn2(float [[TMP13]]) #[[ATTR3]]
+; VF2IC2-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 1
+; VF2IC2-NEXT:    [[TMP16:%.*]] = extractvalue { float, float } [[TMP14]], 0
+; VF2IC2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]]
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP19:%.*]] = fdiv float [[TMP16]], [[TMP18]]
+; VF2IC2-NEXT:    store float [[TMP19]], ptr [[TMP17]], align 8
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC2:       [[PRED_STORE_CONTINUE3]]:
+; VF2IC2-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; VF2IC2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
+; VF2IC2:       [[PRED_STORE_IF4]]:
+; VF2IC2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP22:%.*]] = tail call { float, float } @fn2(float [[TMP21]]) #[[ATTR3]]
+; VF2IC2-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT:    [[TMP24:%.*]] = extractvalue { float, float } [[TMP22]], 0
+; VF2IC2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]]
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP27:%.*]] = fdiv float [[TMP24]], [[TMP26]]
+; VF2IC2-NEXT:    store float [[TMP27]], ptr [[TMP25]], align 8
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE5]]
+; VF2IC2:       [[PRED_STORE_CONTINUE5]]:
+; VF2IC2-NEXT:    [[TMP28:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; VF2IC2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
+; VF2IC2:       [[PRED_STORE_IF6]]:
+; VF2IC2-NEXT:    [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP30:%.*]] = tail call { float, float } @fn2(float [[TMP29]]) #[[ATTR3]]
+; VF2IC2-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 3
+; VF2IC2-NEXT:    [[TMP32:%.*]] = extractvalue { float, float } [[TMP30]], 0
+; VF2IC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]]
+; VF2IC2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP35:%.*]] = fdiv float [[TMP32]], [[TMP34]]
+; VF2IC2-NEXT:    store float [[TMP35]], ptr [[TMP33]], align 8
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
+; VF2IC2:       [[PRED_STORE_CONTINUE7]]:
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 8
+  %sgt_zero = fcmp ogt float %in_val, 0.0
+  br i1 %sgt_zero, label %if.then, label %for.inc
+
+if.then:
+  %call = tail call { float, float } @fn2(float %in_val) #3
+  %extract_a = extractvalue { float, float } %call, 0
+  %div = fdiv float %extract_a, %in_val
+  store float %div, ptr %arrayidx, align 8
+  br label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 declare { i64 } @fn1(float)
 declare { float, float } @fn2(float)
 declare { i32, i32, i32 } @fn3(i32)
@@ -464,3 +682,4 @@ declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>)
 attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" }
 attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" }
 attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" }
+attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVnM8v_fn2(fixed_vec_fn2)" }