llvm
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlan.h‎
Lines changed: 1 addition & 0 deletions b/‎llvm/lib/Transforms/Vectorize/VPlan.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp‎
Lines changed: 3 additions & 1 deletion b/‎llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp‎
Lines changed: 24 additions & 5 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp‎
Lines changed: 24 additions & 5 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp‎
Lines changed: 21 additions & 0 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll‎
Lines changed: 4 additions & 4 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll‎
Lines changed: 22 additions & 11 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll‎
Lines changed: 22 additions & 11 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll‎
Lines changed: 2 additions & 1 deletion b/‎llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll‎
Lines changed: 8 additions & 8 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll‎
Lines changed: 8 additions & 8 deletions
@@ -971,6 +971,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     StepVector,
 
     Pack,
+    Unpack,
 
   };
 
 
@@ -101,6 +101,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return Type::getIntNTy(Ctx, 32);
   case Instruction::PHI:
   case VPInstruction::Pack:
+  case VPInstruction::Unpack:
     // Infer the type of first operand only, as other operands of header phi's
     // may lead to infinite recursion.
     return inferScalarType(R->getOperand(0));
@@ -442,7 +443,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       break;
     for (VPRecipeBase &R : *VPBB) {
       if (isa<VPInstruction>(&R) &&
-          cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Pack)
+          (cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Pack ||
+           cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Unpack))
         continue;
       Idx2Recipe.push_back(&R);
 
 
@@ -418,7 +418,8 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
 }
 
 bool VPInstruction::doesGeneratePerAllLanes() const {
-  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
+  return (Opcode == VPInstruction::PtrAdd || Opcode == VPInstruction::Unpack) &&
+         !vputils::onlyFirstLaneUsed(this);
 }
 
 bool VPInstruction::canGenerateScalarForFirstLane() const {
@@ -438,6 +439,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
   case VPInstruction::AnyOf:
+  case VPInstruction::Unpack:
     return true;
   default:
     return false;
@@ -448,10 +450,17 @@ Value *VPInstruction::generatePerLane(VPTransformState &State,
                                       const VPLane &Lane) {
   IRBuilderBase &Builder = State.Builder;
 
-  assert(getOpcode() == VPInstruction::PtrAdd &&
-         "only PtrAdd opcodes are supported for now");
-  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
-                              State.get(getOperand(1), Lane), Name);
+  switch (getOpcode()) {
+  case VPInstruction::PtrAdd:
+    return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
+                                State.get(getOperand(1), Lane), Name);
+
+  case VPInstruction::Unpack: {
+    Value *LaneV = Lane.getAsRuntimeExpr(State.Builder, State.VF);
+    return Builder.CreateExtractElement(State.get(getOperand(0)), LaneV);
+  }
+  }
+  llvm_unreachable("all supported opcodes must be handled above");
 }
 
 /// Create a conditional branch using \p Cond branching to the successors of \p
@@ -775,6 +784,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
           State.packScalarIntoVectorizedValue(getOperand(0), WideValue, Lane);
     return WideValue;
   }
+  case VPInstruction::Unpack: {
+    // This path emits the extract for lane 0 only, so it is valid solely when
+    // no user of this recipe needs any other lane.
+    assert(vputils::onlyFirstLaneUsed(this) &&
+           "can only generate first lane for Unpack");
+    return generatePerLane(State, VPLane(0));
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -934,6 +948,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::StepVector:
   case VPInstruction::ReductionStartVector:
   case VPInstruction::Pack:
+  case VPInstruction::Unpack:
     return false;
   default:
     return true;
@@ -1077,6 +1092,10 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::Pack:
     O << "pack-into-vector";
     break;
+  case VPInstruction::Unpack:
+    O << "unpack-into-scalars";
+    break;
+
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
 
@@ -1927,6 +1927,27 @@ static void materializePack(VPlan &Plan) {
             cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
         continue;
       auto *Def = cast<VPSingleDefRecipe>(&R);
+      for (auto *Op : to_vector(Def->operands())) {
+        VPRecipeBase *OpDef = Op->getDefiningRecipe();
+        if (!OpDef || isa<VPReplicateRecipe>(OpDef) ||
+            vputils::isSingleScalar(Op) ||
+            (isa<VPInstruction>(OpDef) &&
+             cast<VPInstruction>(OpDef)->doesGeneratePerAllLanes()) ||
+            isa<VPScalarIVStepsRecipe>(OpDef))
+          continue;
+
+        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Op});
+        if (OpDef->isPhi())
+          Unpack->insertBefore(*OpDef->getParent(),
+                               OpDef->getParent()->getFirstNonPhi());
+        else
+          Unpack->insertAfter(OpDef);
+        Op->replaceUsesWithIf(Unpack, [](VPUser &U, unsigned) {
+          auto *RepR = dyn_cast<VPReplicateRecipe>(&U);
+          return RepR && (!isa<StoreInst>(RepR->getUnderlyingInstr()) ||
+                          !vputils::isSingleScalar(RepR->getOperand(1)));
+        });
+      }
       if (all_of(Def->users(),
                  [Def](VPUser *U) { return U->usesScalars(Def); }))
         continue;
 
@@ -29,12 +29,12 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP10]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP10]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
@@ -62,8 +62,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP23]])
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i32 0
 ; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1
 
@@ -15,12 +15,14 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
@@ -66,17 +68,20 @@ exit:
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
@@ -123,32 +128,38 @@ exit:
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 
@@ -28,7 +28,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN ir<%load> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:     REPLICATE ir<%call> = call @foo(ir<%load>)
+; CHECK-NEXT:     EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%load>
+; CHECK-NEXT:     REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-NEXT:     EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
 
@@ -768,15 +768,15 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT:    [[TMP22:%.*]] = shufflevector <6 x i32> [[WIDE_VEC1]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
 ; VF2-NEXT:    [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
 ; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
-; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
 ; VF2-NEXT:    store i32 [[TMP16]], ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
 ; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
 ; VF2-NEXT:    [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
 ; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
-; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    store i32 [[TMP25]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98
@@ -809,25 +809,25 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF4-NEXT:    [[TMP44:%.*]] = shufflevector <12 x i32> [[WIDE_VEC1]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; VF4-NEXT:    [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
 ; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
-; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
 ; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
-; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
 ; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
-; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
 ; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
 ; VF4-NEXT:    store i32 [[TMP32]], ptr [[TMP19]], align 8
 ; VF4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
 ; VF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
 ; VF4-NEXT:    [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
 ; VF4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
 ; VF4-NEXT:    [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
-; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
 ; VF4-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
-; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
 ; VF4-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
-; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
 ; VF4-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
 ; VF4-NEXT:    store i32 [[TMP49]], ptr [[TMP36]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-Original file line number
+Diff line change
     StepVector,
     Pack,
 +    Unpack,
   };