diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e62d57e6920b7..d00a7f979fc19 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
-  VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
+  VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks,
+                           BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
   bool HasBranchWeights =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fb696bea671af..acde037b32d30 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1012,6 +1012,11 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     /// Creates a fixed-width vector containing all operands. The number of
     /// operands matches the vector element count.
     BuildVector,
+    /// Extracts all lanes from its (non-scalable) vector operand. This is an
+    /// abstract VPInstruction whose single defined VPValue represents VF
+    /// scalars extracted from a vector, to be replaced by VF ExtractElement
+    /// VPInstructions.
+    Unpack,
     /// Compute the final result of a AnyOf reduction with select(cmp(),x,y),
     /// where one of (x,y) is loop invariant, and both x and y are integer type.
     ComputeAnyOfResult,
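Note: to make the Unpack abstraction concrete, here is an illustrative sketch
with hypothetical value names (not output of this patch). With VF = 4, a single
abstract `unpack` of a `<4 x i32>` value %v stands for the VF ExtractElement
VPInstructions that replicateByVF later materializes, roughly:

  %v.lane0 = extractelement <4 x i32> %v, i32 0
  %v.lane1 = extractelement <4 x i32> %v, i32 1
  %v.lane2 = extractelement <4 x i32> %v, i32 2
  %v.lane3 = extractelement <4 x i32> %v, i32 3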
@@ -2717,6 +2722,15 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
     return R && classof(R);
   }
 
+  static inline bool classof(const VPValue *VPV) {
+    const VPRecipeBase *R = VPV->getDefiningRecipe();
+    return R && classof(R);
+  }
+
+  static inline bool classof(const VPSingleDefRecipe *R) {
+    return classof(static_cast<const VPRecipeBase *>(R));
+  }
+
   /// Generate the reduction in the loop.
   void execute(VPTransformState &State) override;
 
@@ -3102,6 +3116,9 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
   /// Returns true if this expression contains recipes that may have side
   /// effects.
   bool mayHaveSideEffects() const;
+
+  /// Returns true if the result of this VPExpressionRecipe is a single scalar.
+  bool isSingleScalar() const;
 };
 
 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 07bfe7a896d86..025fa78064100 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::AnyOf:
   case VPInstruction::BuildStructVector:
   case VPInstruction::BuildVector:
+  case VPInstruction::Unpack:
     return SetResultTyFromOp();
   case VPInstruction::ExtractLane:
     return inferScalarType(R->getOperand(1));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 555efea1ea840..e05923cb5df85 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -368,6 +368,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
 }
 
+template <typename Op0_t, typename Op1_t>
+inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t>
+m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t, typename Op2_t>
 inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
 m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8e91677292788..70b34879e3121 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -514,6 +514,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::ExtractPenultimateElement:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::Not:
+  case VPInstruction::Unpack:
     return 1;
   case Instruction::ICmp:
   case Instruction::FCmp:
@@ -1241,6 +1242,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::StepVector:
   case VPInstruction::ReductionStartVector:
   case VPInstruction::VScale:
+  case VPInstruction::Unpack:
     return false;
   default:
     return true;
@@ -1285,7 +1287,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::PtrAdd:
     return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
   case VPInstruction::WidePtrAdd:
-    return Op == getOperand(0);
+    // WidePtrAdd supports scalar and vector base addresses.
+    return false;
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ComputeFindIVResult:
     return Op == getOperand(1);
@@ -1409,6 +1412,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ResumeForEpilogue:
     O << "resume-for-epilogue";
     break;
+  case VPInstruction::Unpack:
+    O << "unpack";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -2880,6 +2886,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const {
   return false;
 }
 
+bool VPExpressionRecipe::isSingleScalar() const {
+  // Cannot use vputils::isSingleScalar(), because all external operands
+  // of the expression will be live-ins while bundled.
+  return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
+         !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
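Note: for intuition on isSingleScalar, an illustrative IR sketch (hypothetical
names, not from this patch). An expression whose final recipe is a full
reduction collapses its input to one scalar per part, e.g.

  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)

whereas a partial reduction keeps a vector accumulator, e.g.

  %p = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %v)

so only the former counts as single-scalar. VPPartialReductionRecipe derives
from VPReductionRecipe, hence the second isa<> check above.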
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c8a2d84a535d3..fbd1261684b23 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1223,6 +1223,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  uint64_t Idx;
+  if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+    Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
+    return;
+  }
+
   if (auto *Phi = dyn_cast<VPPhi>(Def)) {
     if (Phi->getNumOperands() == 1)
       Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -3738,7 +3745,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
   BTC->replaceAllUsesWith(TCMO);
 }
 
-void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
@@ -3787,6 +3794,50 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
       });
     }
   }
+
+  // Create explicit VPInstructions to convert vectors to scalars. The current
+  // implementation is conservative - it may miss some cases that may or may not
+  // be vector values. TODO: introduce Unpacks speculatively - remove them later
+  // if they are known to operate on scalar values.
+  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe>(&R))
+        continue;
+      for (VPValue *Def : R.definedValues()) {
+        // Skip recipes that are single-scalar or only have their first lane
+        // used.
+        // TODO: The Defs skipped here may or may not be vector values.
+        // Introduce Unpacks, and remove them later, if they are guaranteed to
+        // produce scalar values.
+        if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
+          continue;
+
+        // At the moment, we create unpacks only for scalar users outside
+        // replicate regions. Recipes inside replicate regions still extract the
+        // required lanes implicitly.
+        // TODO: Remove once replicate regions are unrolled completely.
+        auto IsCandidateUnpackUser = [Def](VPUser *U) {
+          VPRegionBlock *ParentRegion =
+              cast<VPRecipeBase>(U)->getParent()->getParent();
+          return U->usesScalars(Def) &&
+                 (!ParentRegion || !ParentRegion->isReplicator());
+        };
+        if (none_of(Def->users(), IsCandidateUnpackUser))
+          continue;
+
+        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
+        if (R.isPhi())
+          Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
+        else
+          Unpack->insertAfter(&R);
+        Def->replaceUsesWithIf(Unpack,
+                               [&IsCandidateUnpackUser](VPUser &U, unsigned) {
+                                 return IsCandidateUnpackUser(&U);
+                               });
+      }
+    }
+  }
 }
 
 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
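Note: two illustrative sketches with hypothetical IR (not from this patch).
The new simplification folds an extract of a just-built vector back to the
scalar that went into it, i.e. the VPlan analogue of

  %v = insertelement <2 x i64> poison, i64 %a, i32 0
  %w = insertelement <2 x i64> %v, i64 %b, i32 1
  %e = extractelement <2 x i64> %w, i32 1   ; simplifies to plain %b

The unpack placement also explains the CHECK-line churn in the tests below:
unpacks are inserted directly after the defining recipe (or after the phis,
for phi defs), so once they are expanded, all lane extracts cluster
immediately after the vector definition instead of next to each scalar user:

  %v = add <2 x i32> %a, %b
  %v.0 = extractelement <2 x i32> %v, i32 0
  %v.1 = extractelement <2 x i32> %v, i32 1
  ...
  store i32 %v.0, ptr %p0
  store i32 %v.1, ptr %p1

The relative order of the scalar users is unchanged; only the extracts move up.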
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bbc2975e..b28559b620e13 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -325,9 +325,10 @@ struct VPlanTransforms {
   static void materializeBackedgeTakenCount(VPlan &Plan,
                                             VPBasicBlock *VectorPH);
 
-  /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
-  /// values into single vectors.
-  static void materializeBuildVectors(VPlan &Plan);
+  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
+  /// into vectors and Unpack recipes to extract scalars from vectors as
+  /// needed.
+  static void materializePacksAndUnpacks(VPlan &Plan);
 
   /// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
   static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5e7f19faebb56..8caf81e75a75f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -466,10 +466,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
 /// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
 /// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
 /// definitions for operands of \p DefR.
-static VPRecipeWithIRFlags *
+static VPValue *
 cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
              VPRecipeWithIRFlags *DefR, VPLane Lane,
              const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
+  VPValue *Op;
+  if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
+    auto LaneDefs = Def2LaneDefs.find(Op);
+    if (LaneDefs != Def2LaneDefs.end())
+      return LaneDefs->second[Lane.getKnownLane()];
+
+    VPValue *Idx =
+        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+    return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+  }
+
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
   for (VPValue *Op : DefR->operands()) {
@@ -481,6 +492,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
       continue;
     }
     if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+      // Look through mandatory Unpack.
+      [[maybe_unused]] bool Matched =
+          match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
+      assert(Matched && "original op must have been Unpack");
       NewOps.push_back(
           Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
       continue;
@@ -548,7 +563,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
         (isa<VPReplicateRecipe>(&R) &&
          cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
         (isa<VPInstruction>(&R) &&
-         !cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
+         !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
+         cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
       continue;
 
     auto *DefR = cast<VPRecipeWithIRFlags>(&R);
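Note: a sketch of why the ScalableLast path looks through the unpack
(hypothetical IR, not from this patch). A scalable vector has no
compile-time-constant last index, so instead of a constant-index
extractelement the lane is taken via ExtractLastElement, which lowers to
roughly

  %vscale = call i64 @llvm.vscale.i64()
  %ec = shl i64 %vscale, 2            ; element count of <vscale x 4 x float>
  %last = sub i64 %ec, 1
  %e = extractelement <vscale x 4 x float> %v, i64 %last

For fixed-width lanes, cloneForLane can instead reuse the per-lane defs
recorded in Def2LaneDefs or emit a constant-index extractelement directly.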
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 0222b0aa81063..40177d0ddb426 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
             all_of(VPI->operands(), isSingleScalar));
+  if (isa<VPPartialReductionRecipe>(VPV))
+    return false;
+  if (isa<VPReductionRecipe>(VPV))
+    return true;
+  if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
+    return Expr->isSingleScalar();
 
   // VPExpandSCEVRecipes must be placed in the entry and are always uniform.
   return isa<VPExpandSCEVRecipe>(VPV);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index d10a26d1a73df..fb836d844638d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -241,12 +241,12 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP23:%.*]] = udiv [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement [[TMP23]], i32 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = urem i64 [[INDEX]], [[MUL_2_I]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement [[TMP23]], i32 0
 ; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 307d4c43198af..d23e3c29b59e5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -27,15 +27,15 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP10]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP14]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
 ; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
 ; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
@@ -61,8 +61,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP23]])
 ; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[NEXT_GEP7]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
index 4bb8a0e72acb7..5322021f918ce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
@@ -17,6 +17,7 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
@@ -59,7 +60,6 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP24]] = phi <4 x i16> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP24]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP26:%.*]] = sext <4 x i16> [[TMP25]] to <4 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP27]]
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 93e71af74f4ac..875cd431c1ee8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -284,17 +284,17 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
 ; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP6]], align 2
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
 ; CHECK-NEXT:    store i16 [[TMP11]], ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
 ; CHECK-NEXT:    store i16 [[TMP12]], ptr [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
 ; CHECK-NEXT:    store i16 [[TMP13]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT:    store i64 0, ptr [[B]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index ab9b48fb68f6b..5a99b82224831 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -205,25 +205,25 @@ define void @test_load_gep_widen_induction(ptr noalias %dst, ptr noalias %dst2) {
 ; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
 ; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
-; CHECK-NEXT:    store ptr null, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
 ; CHECK-NEXT:    store ptr null, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
 ; CHECK-NEXT:    store ptr null, ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
 ; CHECK-NEXT:    store ptr null, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
 ; CHECK-NEXT:    store ptr null, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
 ; CHECK-NEXT:    store ptr null, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
 ; CHECK-NEXT:    store ptr null, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
 ; CHECK-NEXT:    store ptr null, ptr [[TMP11]], align 8
+; CHECK-NEXT:    store ptr null, ptr [[TMP17]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr ptr, ptr [[DST2]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index cde89763b26c3..48e4613d27738 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -812,10 +812,10 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0
 ; CHECK-NEXT:    [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 4
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true))
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
@@ -884,11 +884,11 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 4
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true))
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
@@ -1120,8 +1120,8 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[P:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index ef111caafbf0e..f223786a07cdf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -35,11 +35,11 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 1
 ; CHECK-NEXT:    store <vscale x 2 x ptr> [[TMP9]], ptr [[NEXT_GEP]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
 ; CHECK-NEXT:    store <vscale x 2 x i8> [[TMP12]], ptr [[TMP10]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index cf41664c28f3b..b803c3ab8d6c3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -239,9 +239,9 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    store <vscale x 2 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8
@@ -313,8 +313,8 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <vscale x 2 x i64> [[TMP4]], splat (i64 1)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x ptr> [[VECTOR_GEP]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x ptr> [[VECTOR_GEP]], zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index dd63b5ed20d13..6d0c55b1d246c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -748,15 +748,15 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT:    [[TMP22:%.*]] = shufflevector <6 x i32> [[WIDE_VEC1]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
 ; VF2-NEXT:    [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
 ; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
-; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
 ; VF2-NEXT:    store i32 [[TMP16]], ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
 ; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
 ; VF2-NEXT:    [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
 ; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
-; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    store i32 [[TMP25]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98
@@ -789,12 +789,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF4-NEXT:    [[TMP44:%.*]] = shufflevector <12 x i32> [[WIDE_VEC1]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; VF4-NEXT:    [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
 ; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
-; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
 ; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
-; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
 ; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
-; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
 ; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
 ; VF4-NEXT:    store i32 [[TMP32]], ptr [[TMP19]], align 8
 ; VF4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
 ; VF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
@@ -802,12 +802,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
 ; VF4-NEXT:    [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
-; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
 ; VF4-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
-; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
 ; VF4-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
-; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
 ; VF4-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
 ; VF4-NEXT:    store i32 [[TMP49]], ptr [[TMP36]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
index 187edb580f8e2..4761cb0d63de7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
@@ -20,30 +20,30 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP4]], align 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[WIDE_LOAD]], splat (i16 10)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = ashr exact i64 [[TMP10]], 32
-; CHECK-NEXT:    [[TMP15:%.*]] = ashr exact i64 [[TMP11]], 32
-; CHECK-NEXT:    [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32
-; CHECK-NEXT:    [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
-; CHECK-NEXT:    store i16 [[TMP22]], ptr [[TMP18]], align 2
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
-; CHECK-NEXT:    store i16 [[TMP23]], ptr [[TMP19]], align 2
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT:    store i16 [[TMP24]], ptr [[TMP20]], align 2
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; CHECK-NEXT:    store i16 [[TMP25]], ptr [[TMP21]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
+; CHECK-NEXT:    [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
+; CHECK-NEXT:    [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
+; CHECK-NEXT:    [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]]
+; CHECK-NEXT:    store i16 [[TMP6]], ptr [[TMP22]], align 2
+; CHECK-NEXT:    store i16 [[TMP7]], ptr [[TMP23]], align 2
+; CHECK-NEXT:    store i16 [[TMP8]], ptr [[TMP24]], align 2
+; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP25]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -95,30 +95,30 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32
-; CHECK-NEXT:    [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
-; CHECK-NEXT:    [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
-; CHECK-NEXT:    [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP7]], i32 0
-; CHECK-NEXT:    store i16 [[TMP24]], ptr [[TMP20]], align 2
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP7]], i32 1
-; CHECK-NEXT:    store i16 [[TMP25]], ptr [[TMP21]], align 2
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP7]], i32 2
-; CHECK-NEXT:    store i16 [[TMP26]], ptr [[TMP22]], align 2
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3
-; CHECK-NEXT:    store i16 [[TMP27]], ptr [[TMP23]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
+; CHECK-NEXT:    [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32
+; CHECK-NEXT:    [[TMP23:%.*]] = ashr exact i64 [[TMP19]], 32
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP23]]
+; CHECK-NEXT:    store i16 [[TMP8]], ptr [[TMP24]], align 2
+; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP25]], align 2
+; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP26]], align 2
+; CHECK-NEXT:    store i16 [[TMP11]], ptr [[TMP27]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 96a25a853f880..5999707699970 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -116,11 +116,11 @@ define void @test(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast <2 x double> @__simd_sin_v2f64(<2 x double> [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    store double [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
 ; CHECK-NEXT:    store double [[TMP9]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 2adb3b5035582..fc0b19da47f4b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -54,8 +54,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
 ; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
 ; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
 ; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
 ; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
 ; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 5ca9bfdb29c2c..f2f65685e9bad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -870,17 +870,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
 ; RV64-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
 ; RV64-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]]
 ; RV64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
 ; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
 ; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
 ; RV64-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
 ; RV64-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
 ; RV64-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV64-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; RV64-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
@@ -921,17 +921,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV32-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
 ; RV32-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
 ; RV32-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]]
 ; RV32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
 ; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
 ; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
 ; RV32-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
 ; RV32-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
 ; RV32-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV32-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; RV32-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
@@ -992,7 +992,15 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
 ; RV64-UF2-NEXT:    [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
 ; RV64-UF2-NEXT:    [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
+; RV64-UF2-NEXT:    [[TMP50:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
+; RV64-UF2-NEXT:    [[TMP51:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
+; RV64-UF2-NEXT:    [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
+; RV64-UF2-NEXT:    [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
 ; RV64-UF2-NEXT:    [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
+; RV64-UF2-NEXT:    [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
+; RV64-UF2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
+; RV64-UF2-NEXT:    [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
+; RV64-UF2-NEXT:    [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
 ; RV64-UF2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP8]]
 ; RV64-UF2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP9]]
 ; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP10]]
@@ -1001,21 +1009,13 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
 ; RV64-UF2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
 ; RV64-UF2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
-; RV64-UF2-NEXT:    [[TMP50:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
 ; RV64-UF2-NEXT:    store i7 [[TMP50]], ptr [[TMP42]], align 1
-; RV64-UF2-NEXT:    [[TMP51:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
 ; RV64-UF2-NEXT:    store i7 [[TMP51]], ptr [[TMP43]], align 1
-; RV64-UF2-NEXT:    [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
 ; RV64-UF2-NEXT:    store i7 [[TMP52]], ptr [[TMP44]], align 1
-; RV64-UF2-NEXT:    [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
 ; RV64-UF2-NEXT:    store i7 [[TMP53]], ptr [[TMP45]], align 1
-; RV64-UF2-NEXT:    [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
 ; RV64-UF2-NEXT:    store i7 [[TMP54]], ptr [[TMP46]], align 1
-; RV64-UF2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
 ; RV64-UF2-NEXT:    store i7 [[TMP55]], ptr [[TMP47]], align 1
-; RV64-UF2-NEXT:    [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
 ; RV64-UF2-NEXT:    store i7 [[TMP56]], ptr [[TMP48]], align 1
-; RV64-UF2-NEXT:    [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
 ; RV64-UF2-NEXT:    store i7 [[TMP57]], ptr [[TMP49]], align 1
 ; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; RV64-UF2-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 2fbc73ef74d16..01640cd347d82 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -269,8 +269,8 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-UF2-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-UF2-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 8)
 ; CHECK-UF2-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP11]]
-; CHECK-UF2-NEXT:    [[STEP_ADD:%.*]] = getelementptr i8, <vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x i64> [[TMP9]]
 ; CHECK-UF2-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP]], i32 0
+; CHECK-UF2-NEXT:    [[STEP_ADD:%.*]] = getelementptr i8, <vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x i64> [[TMP9]]
 ; CHECK-UF2-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
 ; CHECK-UF2-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-UF2-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e11b1ad7f09dc..7cf830918402b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -107,24 +107,24 @@ define void @PR31671(float %x, ptr %d) #0 {
 ; FORCE-NEXT:    [[WIDE_VEC13:%.*]] = load <10 x float>, ptr [[TMP22]], align 4
 ; FORCE-NEXT:    [[STRIDED_VEC14:%.*]] = shufflevector <10 x float> [[WIDE_VEC13]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
 ; FORCE-NEXT:    [[TMP24:%.*]] = fadd <2 x float> [[STRIDED_VEC8]], [[TMP12]]
+; FORCE-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
+; FORCE-NEXT:    [[TMP29:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
 ; FORCE-NEXT:    [[TMP25:%.*]] = fadd <2 x float> [[STRIDED_VEC10]], [[TMP13]]
+; FORCE-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP25]], i32 0
+; FORCE-NEXT:    [[TMP31:%.*]] = extractelement <2 x float> [[TMP25]], i32 1
 ; FORCE-NEXT:    [[TMP26:%.*]] = fadd <2 x float> [[STRIDED_VEC12]], [[TMP14]]
+; FORCE-NEXT:    [[TMP32:%.*]] = extractelement <2 x float> [[TMP26]], i32 0
+; FORCE-NEXT:    [[TMP33:%.*]] = extractelement <2 x float> [[TMP26]], i32 1
 ; FORCE-NEXT:    [[TMP27:%.*]] = fadd <2 x float> [[STRIDED_VEC14]], [[TMP15]]
-; FORCE-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
+; FORCE-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP27]], i32 0
+; FORCE-NEXT:    [[TMP35:%.*]] = extractelement <2 x float> [[TMP27]], i32 1
 ; FORCE-NEXT:    store float [[TMP28]], ptr [[TMP16]], align 4
-; FORCE-NEXT:    [[TMP29:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
 ; FORCE-NEXT:    store float [[TMP29]], ptr [[TMP17]], align 4
-; FORCE-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP25]], i32 0
 ; FORCE-NEXT:    store float [[TMP30]], ptr [[TMP18]], align 4
-; FORCE-NEXT:    [[TMP31:%.*]] = extractelement <2 x float> [[TMP25]], i32 1
 ; FORCE-NEXT:    store float [[TMP31]], ptr [[TMP19]], align 4
-; FORCE-NEXT:    [[TMP32:%.*]] = extractelement <2 x float> [[TMP26]], i32 0
 ; FORCE-NEXT:    store float [[TMP32]], ptr [[TMP20]], align 4
-; FORCE-NEXT:    [[TMP33:%.*]] = extractelement <2 x float> [[TMP26]], i32 1
 ; FORCE-NEXT:    store float [[TMP33]], ptr [[TMP21]], align 4
-; FORCE-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP27]], i32 0
 ; FORCE-NEXT:    store float [[TMP34]], ptr [[TMP22]], align 4
-; FORCE-NEXT:    [[TMP35:%.*]] = extractelement <2 x float> [[TMP27]], i32 1
 ; FORCE-NEXT:    store float [[TMP35]], ptr [[TMP23]], align 4
 ; FORCE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; FORCE-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6392
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
index 590b2691c3238..98c204a3ce7ee 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -27,36 +27,36 @@ define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B,
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP9]], i32 4, <16 x i1> [[TMP7]], <16 x float> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <16 x float> [[WIDE_MASKED_LOAD]], splat (float 2.000000e+00)
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP10]], i32 3
-; CHECK-NEXT:    [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]])
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x float> [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x float> [[TMP10]], i32 5
-; CHECK-NEXT:    [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]])
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x float> [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]])
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x float> [[TMP10]], i32 7
-; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]])
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x float> [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]])
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x float> [[TMP10]], i32 9
-; CHECK-NEXT:    [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]])
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x float> [[TMP10]], i32 10
-; CHECK-NEXT:    [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]])
 ; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x float> [[TMP10]], i32 11
-; CHECK-NEXT:    [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x float> [[TMP10]], i32 12
-; CHECK-NEXT:    [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]])
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x float> [[TMP10]], i32 13
-; CHECK-NEXT:    [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]])
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x float> [[TMP10]], i32 14
-; CHECK-NEXT:    [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]])
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x float> [[TMP10]], i32 15
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]])
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]])
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]])
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]])
+; CHECK-NEXT:    [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]])
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]])
+; CHECK-NEXT:    [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]])
+; CHECK-NEXT:    [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]])
+; CHECK-NEXT:    [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]])
+; CHECK-NEXT:    [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]])
+; CHECK-NEXT:    [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]])
+; CHECK-NEXT:    [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]])
+; CHECK-NEXT:    [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]])
@llvm.pow.f32(float [[TMP41]], float [[X]]) ; CHECK-NEXT: [[TMP43:%.*]] = tail call float @llvm.pow.f32(float [[TMP12]], float [[X]]) ; CHECK-NEXT: [[TMP44:%.*]] = tail call float @llvm.pow.f32(float [[TMP14]], float [[X]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index 8126c70edb0d8..f1e29d396b22e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -240,8 +240,8 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[INPUT]], <4 x i64> -; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[PTRS]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[PTRS]], align 8 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP6]], i32 4, <4 x i1> , <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> , <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index 2f33e111d8ca7..2b2aa0806d8da 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -786,16 +786,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]] ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP21]], align 4, !alias.scope [[META8:![0-9]+]] ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; FVW2-NEXT: store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP16]], align 4, !alias.scope [[META15:![0-9]+]] -; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1 -; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1 ; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0 -; FVW2-NEXT: store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1 -; FVW2-NEXT: store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]] +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1 +; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1 +; FVW2-NEXT: store float [[TMP28]], ptr [[TMP25]], align 4, !alias.scope [[META11]], !noalias [[META13]] +; FVW2-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FVW2-NEXT: 
[[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FVW2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll index 2f44c7ecd770f..f8f77ffb8b49e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll @@ -82,10 +82,10 @@ define void @gep_use_outside_loop(ptr noalias %dst, ptr %src) { ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10) -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP6]], i32 2, <4 x i1> [[TMP5]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index a19b294541172..1b8f8dc9d97da 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -297,36 +297,36 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8> ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i32 0 -; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META18:![0-9]+]], !noalias [[META15]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i32 1 -; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i32 2 -; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i32 3 -; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i32 4 -; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i32 5 -; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i32 6 -; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i32 7 -; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i32 8 -; 
CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i32 9 -; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i32 10 -; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i32 11 -; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i32 12 -; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i32 13 -; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i32 14 -; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i32 15 +; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META18:![0-9]+]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll index 8e0401d9baff9..14fb2a76eb75b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll @@ 
-69,40 +69,40 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <8 x i32> [[WIDE_VEC24]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <8 x i32> [[WIDE_VEC24]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = add <4 x i32> [[STRIDED_VEC17]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 ; CHECK-NEXT: [[TMP37:%.*]] = add <4 x i32> [[STRIDED_VEC20]], [[STRIDED_VEC19]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 ; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3 ; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP12]], align 4 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 ; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 ; CHECK-NEXT: store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 ; CHECK-NEXT: store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 ; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP13]], align 4 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 ; CHECK-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2 ; CHECK-NEXT: store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 ; CHECK-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 ; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP14]], align 4 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 ; CHECK-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 ; CHECK-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3 ; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP17]], align 4 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 ; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP15]], align 4 -; 
CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 ; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP18]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 ; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP19]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[NEXT_GEP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll index d17361a6caacb..829fdff5123e2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll @@ -28,18 +28,18 @@ define void @pr63602_1(ptr %arr) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 -; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP8]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <12 x i32>, ptr [[TMP17]], align 4 @@ -47,13 +47,13 @@ define void @pr63602_1(ptr %arr) { ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC2]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> 
[[TMP18]], i32 0 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 -; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 -; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -141,18 +141,18 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP10]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 -; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 -; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]] +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP17]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP18]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[TMP1]], 2 ; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP3]], 2 @@ -161,10 +161,10 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = 
load i32, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP28]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 1 ; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 2 @@ -179,13 +179,13 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 3 ; CHECK-NEXT: [[TMP44:%.*]] = add <4 x i32> [[TMP35]], [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: store i32 [[TMP45]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2 -; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3 -; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP15]], align 4 +; CHECK-NEXT: store i32 [[TMP45]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP17]], align 4 +; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP18]], align 4 +; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll index 3efb82de7e9f1..1dc114e629164 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll @@ -26,9 +26,9 @@ define void @avoid_sinking_store_across_load(ptr %arr) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[VEC_IND2]] ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[STRIDED_VEC]], splat (i32 25) ; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP7]], <4 x ptr> [[TMP6]], i32 4, <4 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6]], i32 0 ; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <12 x i32>, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll index 5d76dfb781636..66809ebb715f0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -73,6 +73,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 @@ -92,14 +96,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2 -; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 -; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]] +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]] +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]] +; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP27]], i64 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !llvm.access.group [[ACC_GRP0]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll index 65058bd86c935..14a83176f02ff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -45,9 +45,25 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 14 ; I64-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 15 ; I64-NEXT: [[TMP20:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 +; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 +; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 +; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> 
[[TMP20]], i32 3 ; I64-NEXT: [[TMP21:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 +; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 +; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2 +; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3 ; I64-NEXT: [[TMP22:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 +; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 +; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 +; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 ; I64-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 +; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 +; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 +; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 ; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]] ; I64-NEXT: [[TMP25:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] ; I64-NEXT: [[TMP26:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] @@ -80,37 +96,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP37]], align 4 ; I64-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4 ; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 -; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 ; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4 -; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 ; I64-NEXT: store double [[TMP57]], ptr [[TMP41]], align 4 -; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 ; I64-NEXT: store double [[TMP58]], ptr [[TMP42]], align 4 -; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3 ; I64-NEXT: store double [[TMP59]], ptr [[TMP43]], align 4 -; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 ; I64-NEXT: store double [[TMP60]], ptr [[TMP44]], align 4 -; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 ; I64-NEXT: store double [[TMP61]], ptr [[TMP45]], align 4 -; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2 ; I64-NEXT: store double [[TMP62]], ptr [[TMP46]], align 4 -; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3 ; I64-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 -; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 ; I64-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 -; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 ; I64-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 -; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 ; I64-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 -; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 ; I64-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 -; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 ; I64-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 -; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 ; I64-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 -; I64-NEXT: 
[[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 ; I64-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 -; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 ; I64-NEXT: store double [[TMP71]], ptr [[TMP55]], align 4 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) @@ -139,21 +139,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 2 ; I64-NEXT: [[TMP78:%.*]] = add i32 [[INDEX4]], 3 ; I64-NEXT: [[TMP79:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> -; I64-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] -; I64-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] -; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] -; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] -; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 -; I64-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 -; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 -; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 ; I64-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP79]], i32 0 -; I64-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 ; I64-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP79]], i32 1 -; I64-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 ; I64-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP79]], i32 2 -; I64-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 ; I64-NEXT: [[TMP91:%.*]] = extractelement <4 x double> [[TMP79]], i32 3 +; I64-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I64-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I64-NEXT: [[TMP86:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I64-NEXT: [[TMP93:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] +; I64-NEXT: [[TMP94:%.*]] = load ptr, ptr [[TMP84]], align 4 +; I64-NEXT: [[TMP95:%.*]] = load ptr, ptr [[TMP85]], align 4 +; I64-NEXT: [[TMP96:%.*]] = load ptr, ptr [[TMP86]], align 4 +; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP93]], align 4 +; I64-NEXT: store double [[TMP88]], ptr [[TMP94]], align 4 +; I64-NEXT: store double [[TMP89]], ptr [[TMP95]], align 4 +; I64-NEXT: store double [[TMP90]], ptr [[TMP96]], align 4 ; I64-NEXT: store double [[TMP91]], ptr [[TMP87]], align 4 ; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 ; I64-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) @@ -201,9 +201,25 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 14 ; I32-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 15 ; I32-NEXT: [[TMP44:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 +; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 +; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 +; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 ; I32-NEXT: [[TMP45:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 +; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 +; I32-NEXT: [[TMP37:%.*]] = extractelement <4 
x double> [[TMP45]], i32 2 +; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 ; I32-NEXT: [[TMP46:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 +; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1 +; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 +; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 ; I32-NEXT: [[TMP55:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 +; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 +; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 +; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 ; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] ; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] ; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] @@ -236,37 +252,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP61]], align 4 ; I32-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP62]], align 4 ; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP71]], align 4 -; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 ; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4 -; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 ; I32-NEXT: store double [[TMP32]], ptr [[TMP24]], align 4 -; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 ; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4 -; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 ; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4 -; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 ; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4 -; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 ; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4 -; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2 ; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4 -; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 ; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4 -; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 ; I32-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 -; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1 ; I32-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 -; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 ; I32-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 -; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 ; I32-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 -; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 ; I32-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 -; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 ; I32-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 -; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 ; I32-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 -; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 ; I32-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 ; I32-NEXT: [[INDEX_NEXT]] = add 
nuw i32 [[INDEX]], 16 ; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) @@ -295,21 +295,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 2 ; I32-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 3 ; I32-NEXT: [[TMP78:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> -; I32-NEXT: [[TMP79:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] -; I32-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] -; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] -; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] -; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP79]], align 4 -; I32-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 -; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 -; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 ; I32-NEXT: [[TMP87:%.*]] = extractelement <4 x double> [[TMP78]], i32 0 -; I32-NEXT: store double [[TMP87]], ptr [[TMP83]], align 4 ; I32-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP78]], i32 1 -; I32-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 ; I32-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP78]], i32 2 -; I32-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 ; I32-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP78]], i32 3 +; I32-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I32-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I32-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I32-NEXT: [[TMP92:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I32-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP94:%.*]] = load ptr, ptr [[TMP84]], align 4 +; I32-NEXT: [[TMP95:%.*]] = load ptr, ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP92]], align 4 +; I32-NEXT: store double [[TMP87]], ptr [[TMP93]], align 4 +; I32-NEXT: store double [[TMP88]], ptr [[TMP94]], align 4 +; I32-NEXT: store double [[TMP89]], ptr [[TMP95]], align 4 ; I32-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 ; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 ; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) @@ -693,20 +693,20 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias % ; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7 ; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]] ; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0 -; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4 ; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1 -; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4 ; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2 -; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4 ; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3 -; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4 ; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4 -; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4 ; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5 -; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4 ; 
I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6 -; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4 ; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7 +; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4 +; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4 +; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4 +; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4 +; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4 ; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4 ; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] ; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] @@ -847,32 +847,32 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s ; I64-NEXT: [[TMP70:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i32 0 ; I64-NEXT: [[TMP71:%.*]] = insertelement <2 x double> [[TMP70]], double [[TMP69]], i32 1 ; I64-NEXT: [[TMP72:%.*]] = fsub <2 x double> zeroinitializer, [[TMP59]] -; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]] -; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]] -; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]] -; I64-NEXT: [[TMP76:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]] -; I64-NEXT: [[TMP77:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]] -; I64-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] -; I64-NEXT: [[TMP79:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] -; I64-NEXT: [[TMP80:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] -; I64-NEXT: [[TMP81:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] -; I64-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] -; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]] ; I64-NEXT: [[TMP84:%.*]] = extractelement <2 x double> [[TMP72]], i32 0 -; I64-NEXT: store double [[TMP84]], ptr [[TMP76]], align 8 ; I64-NEXT: [[TMP85:%.*]] = extractelement <2 x double> [[TMP72]], i32 1 -; I64-NEXT: store double [[TMP85]], ptr [[TMP77]], align 8 +; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]] ; I64-NEXT: [[TMP86:%.*]] = extractelement <2 x double> [[TMP73]], i32 0 -; I64-NEXT: store double [[TMP86]], ptr [[TMP78]], align 8 ; I64-NEXT: [[TMP87:%.*]] = extractelement <2 x double> [[TMP73]], i32 1 -; I64-NEXT: store double [[TMP87]], ptr [[TMP79]], align 8 +; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]] ; I64-NEXT: [[TMP88:%.*]] = extractelement <2 x double> [[TMP74]], i32 0 -; I64-NEXT: store double [[TMP88]], ptr [[TMP80]], align 8 ; I64-NEXT: [[TMP89:%.*]] = extractelement <2 x double> [[TMP74]], i32 1 -; I64-NEXT: store double [[TMP89]], ptr [[TMP81]], align 8 +; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]] ; I64-NEXT: [[TMP90:%.*]] = extractelement <2 x double> [[TMP75]], i32 0 -; I64-NEXT: store double [[TMP90]], ptr [[TMP82]], align 8 ; I64-NEXT: [[TMP91:%.*]] = extractelement <2 x double> [[TMP75]], i32 1 +; I64-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]] +; I64-NEXT: [[TMP94:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]] +; I64-NEXT: [[TMP95:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; I64-NEXT: [[TMP96:%.*]] = 
getelementptr double, ptr [[DST]], i64 [[TMP11]] +; I64-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; I64-NEXT: [[TMP98:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] +; I64-NEXT: [[TMP99:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] +; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]] +; I64-NEXT: store double [[TMP84]], ptr [[TMP93]], align 8 +; I64-NEXT: store double [[TMP85]], ptr [[TMP94]], align 8 +; I64-NEXT: store double [[TMP86]], ptr [[TMP95]], align 8 +; I64-NEXT: store double [[TMP87]], ptr [[TMP96]], align 8 +; I64-NEXT: store double [[TMP88]], ptr [[TMP97]], align 8 +; I64-NEXT: store double [[TMP89]], ptr [[TMP98]], align 8 +; I64-NEXT: store double [[TMP90]], ptr [[TMP99]], align 8 ; I64-NEXT: store double [[TMP91]], ptr [[TMP83]], align 8 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; I64-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 @@ -928,17 +928,17 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s ; I32-NEXT: [[TMP34:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP30]], i32 2 ; I32-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP34]], double [[TMP31]], i32 3 ; I32-NEXT: [[TMP36:%.*]] = fsub <4 x double> zeroinitializer, [[TMP35]] +; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0 +; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1 +; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2 +; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3 ; I32-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]] ; I32-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP5]] ; I32-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP6]] ; I32-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP7]] -; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0 ; I32-NEXT: store double [[TMP41]], ptr [[TMP37]], align 8 -; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1 ; I32-NEXT: store double [[TMP42]], ptr [[TMP38]], align 8 -; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2 ; I32-NEXT: store double [[TMP43]], ptr [[TMP39]], align 8 -; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3 ; I32-NEXT: store double [[TMP44]], ptr [[TMP40]], align 8 ; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; I32-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 15e26782f8e66..2ecd15e14c8a2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -529,6 +529,14 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4 +; 
CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]] @@ -537,21 +545,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0 ; CHECK-NEXT: store i8 [[TMP28]], ptr [[TMP20]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1 ; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2 ; CHECK-NEXT: store i8 [[TMP30]], ptr [[TMP22]], align 1 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3 ; CHECK-NEXT: store i8 [[TMP31]], ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4 ; CHECK-NEXT: store i8 [[TMP32]], ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5 ; CHECK-NEXT: store i8 [[TMP33]], ptr [[TMP25]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6 ; CHECK-NEXT: store i8 [[TMP34]], ptr [[TMP26]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7 ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -608,53 +608,53 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> ; MAX-BW-NEXT: [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; MAX-BW-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8> -; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP16]] -; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] -; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] -; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] -; MAX-BW-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]] -; MAX-BW-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]] -; MAX-BW-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]] -; MAX-BW-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]] -; MAX-BW-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]] -; MAX-BW-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]] -; MAX-BW-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]] -; MAX-BW-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]] -; 
MAX-BW-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]] -; MAX-BW-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]] -; MAX-BW-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]] -; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]] ; MAX-BW-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0 -; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP36]], align 1 ; MAX-BW-NEXT: [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1 -; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP37]], align 1 ; MAX-BW-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2 -; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP38]], align 1 ; MAX-BW-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3 -; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP39]], align 1 ; MAX-BW-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4 -; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP40]], align 1 ; MAX-BW-NEXT: [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5 -; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP41]], align 1 ; MAX-BW-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6 -; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP42]], align 1 ; MAX-BW-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7 -; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP43]], align 1 ; MAX-BW-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8 -; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP44]], align 1 ; MAX-BW-NEXT: [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9 -; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP45]], align 1 ; MAX-BW-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10 -; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP46]], align 1 ; MAX-BW-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11 -; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP47]], align 1 ; MAX-BW-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12 -; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP48]], align 1 ; MAX-BW-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13 -; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP49]], align 1 ; MAX-BW-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14 -; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP50]], align 1 ; MAX-BW-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15 +; MAX-BW-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP16]] +; MAX-BW-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] +; MAX-BW-NEXT: [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] +; MAX-BW-NEXT: [[TMP72:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] +; MAX-BW-NEXT: [[TMP73:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]] +; MAX-BW-NEXT: [[TMP74:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]] +; MAX-BW-NEXT: [[TMP75:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]] +; MAX-BW-NEXT: [[TMP76:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]] +; MAX-BW-NEXT: [[TMP77:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]] +; MAX-BW-NEXT: [[TMP78:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]] +; MAX-BW-NEXT: [[TMP79:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 
[[TMP26]] +; MAX-BW-NEXT: [[TMP80:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]] +; MAX-BW-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]] +; MAX-BW-NEXT: [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]] +; MAX-BW-NEXT: [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]] +; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]] +; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP69]], align 1 +; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP70]], align 1 +; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP71]], align 1 +; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP72]], align 1 +; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP73]], align 1 +; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP74]], align 1 +; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP75]], align 1 +; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP76]], align 1 +; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP77]], align 1 +; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP78]], align 1 +; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP79]], align 1 +; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP80]], align 1 +; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP81]], align 1 +; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP82]], align 1 +; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP83]], align 1 ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll index 4fb928dd3f018..38617d25bfd7a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll @@ -32,12 +32,12 @@ define void @test(ptr %A) { ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[STRIDED_VEC]], splat (i32 2) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 1 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP9]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP10]], align 4 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll index 9ea9e1193f956..44f70b41c7859 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll @@ -26,6 +26,7 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 ; 
CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) ; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) @@ -33,7 +34,6 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD4]], splat (i64 128) ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD5]], splat (i64 128) ; CHECK-NEXT: [[TMP19:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD6]], splat (i64 128) -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP27]] ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i64, ptr [[TMP28]], i32 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index f29428c51c636..0a4f326608dd7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -794,20 +794,20 @@ define dso_local void @unconditional_strided1_optsize(ptr noalias nocapture read ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP15]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP2]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP4]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll index 414394a8942e5..d8cbcec318f22 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -27,41 +27,41 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], splat (i64 2) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP1]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP1]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP8]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP10]], ptr [[TMP3]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], ptr [[TMP5]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP12]], ptr [[TMP7]], 
align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP14]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = or disjoint <4 x i64> [[TMP1]], splat (i64 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP15]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP15]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP15]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP15]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP22]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP24]], ptr [[TMP17]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP25]], ptr [[TMP19]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], ptr [[TMP21]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll index c9fb05c5df3de..d29719de79ffd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll @@ -33,11 +33,11 @@ define void @example() { ; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; FORCED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; FORCED-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[VEC_IND]] to <2 x x86_fp80> +; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 0 +; FORCED-NEXT: [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 1 ; 
FORCED-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[TMP0]] ; FORCED-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[TMP1]] -; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 0 ; FORCED-NEXT: store x86_fp80 [[TMP5]], ptr [[TMP3]], align 16 -; FORCED-NEXT: [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 1 ; FORCED-NEXT: store x86_fp80 [[TMP6]], ptr [[TMP4]], align 16 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FORCED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) @@ -100,8 +100,8 @@ define void @test_replicating_store_x86_fp80_cost(i32 %n, ptr %dst) #0 { ; FORCED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[VEC_IND]] to <2 x i64> ; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; FORCED-NEXT: [[TMP6:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP5]] ; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; FORCED-NEXT: [[TMP6:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP5]] ; FORCED-NEXT: [[TMP8:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP7]] ; FORCED-NEXT: store x86_fp80 0xK00000000000000000000, ptr [[TMP6]], align 16 ; FORCED-NEXT: store x86_fp80 0xK00000000000000000000, ptr [[TMP8]], align 16 diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll index 65c12a15406ff..90fc0959f907a 100644 --- a/llvm/test/Transforms/LoopVectorize/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/assume.ll @@ -15,15 +15,15 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+02) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+02) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]]) -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) ; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll index f64255f29d335..b7804f02f652f 100644 --- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll +++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll @@ -18,20 +18,20 @@ define i32 @foo(ptr nocapture %A) { ; CHECK-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 ; CHECK-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: store i32 4, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: store i32 4, ptr [[TMP5]], align 4 ; CHECK-NEXT: store i32 4, ptr [[TMP7]], align 4 ; CHECK-NEXT: store i32 4, ptr [[TMP9]], align 4 +; CHECK-NEXT: store i32 4, ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index 1fc4a017f9ec5..45405511abe90 100644 --- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -988,30 +988,30 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP47]], i32 2 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP56]], i32 3 ; CHECK-NEXT: [[TMP25:%.*]] = sub <4 x i32> [[TMP24]], [[TMP12]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3 ; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP40]], [[TMP40]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 2 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 2 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 2 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 2 -; CHECK-NEXT: 
[[TMP30:%.*]] = extractelement <4 x i32> [[TMP25]], i32 0 ; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i32 1 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP27]], align 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i32 2 ; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3 ; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP29]], align 8 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 3 ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 3 ; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 3 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 3 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 ; CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP48]], align 8 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 ; CHECK-NEXT: store i32 [[TMP53]], ptr [[TMP49]], align 8 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 ; CHECK-NEXT: store i32 [[TMP54]], ptr [[TMP50]], align 8 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[TMP51]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1078,30 +1078,30 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; INTER-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <4 x i32> ; INTER-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <4 x i32> ; INTER-NEXT: [[TMP17:%.*]] = sub <4 x i32> [[STRIDED_VEC6]], [[STRIDED_VEC]] +; INTER-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0 +; INTER-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP17]], i32 1 +; INTER-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2 +; INTER-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3 ; INTER-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC4]] +; INTER-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0 +; INTER-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 +; INTER-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 +; INTER-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 ; INTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 2 ; INTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 2 ; INTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 2 ; INTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 2 -; INTER-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0 ; INTER-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 8 -; INTER-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP17]], i32 1 ; INTER-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 8 -; INTER-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2 ; INTER-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 8 -; INTER-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3 ; INTER-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 8 ; INTER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 3 ; INTER-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 3 ; INTER-NEXT: 
[[TMP27:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 3 ; INTER-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 3 -; INTER-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0 ; INTER-NEXT: store i32 [[TMP23]], ptr [[TMP19]], align 8 -; INTER-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 ; INTER-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 8 -; INTER-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 ; INTER-NEXT: store i32 [[TMP25]], ptr [[TMP27]], align 8 -; INTER-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 ; INTER-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 8 ; INTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTER-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1316,9 +1316,9 @@ define i32 @pointer_iv_mixed(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 8, !alias.scope [[META20:![0-9]+]] ; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8, !alias.scope [[META23:![0-9]+]], !noalias [[META20]] @@ -1382,9 +1382,9 @@ define i32 @pointer_iv_mixed(ptr %a, ptr %b, i64 %n) { ; INTER-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] ; INTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; INTER-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; INTER-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; INTER-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; INTER-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]] -; INTER-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; INTER-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 8, !alias.scope [[META20:![0-9]+]] ; INTER-NEXT: [[TMP7]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] ; INTER-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8, !alias.scope [[META23:![0-9]+]], !noalias [[META20]] diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index bcea03a6b9ba8..5177d7b6e0090 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -15,8 +15,8 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; 
CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 @@ -144,8 +144,8 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 2) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 @@ -219,8 +219,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 @@ -294,8 +294,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 @@ -369,8 +369,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> 
[[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 @@ -514,6 +514,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer @@ -536,10 +538,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK: [[PRED_LOAD_CONTINUE2]]: ; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP12]], <2 x i32> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP20]], i64 4), "dereferenceable"(ptr [[TMP20]], i64 4) ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP19]], i64 4), "dereferenceable"(ptr [[TMP19]], i64 4) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP3]], i64 4), "dereferenceable"(ptr [[TMP3]], i64 4) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 @@ -593,8 +593,8 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 diff --git 
a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll index c07dc8804f36e..76596130b946f 100644 --- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll @@ -48,14 +48,14 @@ define dso_local void @forked_ptrs_different_base_same_offset(ptr nocapture read ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT9]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP29]], i64 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP31]], i64 12 ; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP10]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/histograms.ll b/llvm/test/Transforms/LoopVectorize/histograms.ll index f0ceae7d5816b..5bb8722d734f4 100644 --- a/llvm/test/Transforms/LoopVectorize/histograms.ll +++ b/llvm/test/Transforms/LoopVectorize/histograms.ll @@ -16,8 +16,8 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[TMP5]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index cc55a51e134a6..e33995327b856 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1247,8 +1247,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; CHECK-NEXT: 
store i32 [[TMP10]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1293,8 +1293,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; IND-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 ; IND-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[TMP6]], [[BROADCAST_SPLAT]] ; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0 -; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1 +; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1347,14 +1347,14 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 ; UNROLL-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 ; UNROLL-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]] -; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0 -; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1 -; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i64 0 -; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP16]], i64 1 +; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 +; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1408,14 +1408,14 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: store i32 [[TMP21]], ptr [[TMP7]], align 8 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = 
icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1473,22 +1473,22 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 ; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3 -; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 ; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3 +; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 ; INTERLEAVE-NEXT: store i32 [[TMP26]], ptr [[TMP16]], align 8 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1568,10 +1568,10 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP13]], align 1, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP26]], align 1, !alias.scope [[META17:![0-9]+]] ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 @@ -1630,16 +1630,16 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; IND-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0 -; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; IND-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1 +; IND-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope [[META17:![0-9]+]] +; IND-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 1, !alias.scope [[META17:![0-9]+]] ; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope [[META17]] ; IND-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 ; IND-NEXT: [[TMP17:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 12 -; IND-NEXT: store i32 [[TMP24]], ptr [[TMP16]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] +; IND-NEXT: store i32 [[TMP25]], ptr [[TMP16]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] ; IND-NEXT: store i32 [[TMP15]], ptr [[TMP18]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) @@ -1694,20 +1694,20 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[TMP12:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0 +; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1 ; UNROLL-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2) ; UNROLL-NEXT: [[TMP13:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8) -; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0 -; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1 +; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0 +; UNROLL-NEXT: [[TMP35:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1 ; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] -; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0 ; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] -; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1 ; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] -; UNROLL-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17:![0-9]+]] -; UNROLL-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope [[META17]] -; UNROLL-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]] -; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP35]] +; UNROLL-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope 
[[META17:![0-9]+]] +; UNROLL-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP36]], align 1, !alias.scope [[META17]] ; UNROLL-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 ; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] @@ -1716,9 +1716,9 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i64 20 ; UNROLL-NEXT: [[TMP28:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP28]], i64 28 -; UNROLL-NEXT: store i32 [[TMP35]], ptr [[TMP23]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] -; UNROLL-NEXT: store i32 [[TMP36]], ptr [[TMP25]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; UNROLL-NEXT: store i32 [[TMP37]], ptr [[TMP27]], align 1, !alias.scope [[META20]], !noalias [[META17]] +; UNROLL-NEXT: store i32 [[TMP37]], ptr [[TMP23]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] +; UNROLL-NEXT: store i32 [[TMP38]], ptr [[TMP25]], align 1, !alias.scope [[META20]], !noalias [[META17]] +; UNROLL-NEXT: store i32 [[TMP39]], ptr [[TMP27]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; UNROLL-NEXT: store i32 [[TMP22]], ptr [[TMP29]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) @@ -1779,19 +1779,19 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 3 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], splat (i64 2) -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]] -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP16]], align 1, !alias.scope [[META17:![0-9]+]] -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope [[META17]] -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]] -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope [[META17]] +; 
UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP37]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope [[META17:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP38]], align 1, !alias.scope [[META17]] ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 @@ -1859,7 +1859,15 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: [[DOTIDX5:%.*]] = shl nsw i64 [[TMP14]], 4 ; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[DOTIDX5]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP18]], align 1, !alias.scope [[META17:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 +; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 +; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 ; INTERLEAVE-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP19]], align 1, !alias.scope [[META17]] +; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 +; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4 +; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 +; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 ; INTERLEAVE-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] @@ -1876,21 +1884,13 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP24]], i64 52 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i64 60 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP28]], ptr [[TMP41]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 ; INTERLEAVE-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 ; INTERLEAVE-NEXT: store i32 [[TMP30]], ptr [[TMP42]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 ; INTERLEAVE-NEXT: store i32 [[TMP31]], ptr [[TMP20]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP32]], ptr [[TMP21]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement 
<16 x i32> [[WIDE_VEC3]], i64 4 ; INTERLEAVE-NEXT: store i32 [[TMP33]], ptr [[TMP23]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 ; INTERLEAVE-NEXT: store i32 [[TMP34]], ptr [[TMP25]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 ; INTERLEAVE-NEXT: store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2445,11 +2445,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 ; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2492,13 +2492,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; IND-NEXT: [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16> +; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 +; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; IND-NEXT: [[TMP16:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP16]], i64 6 -; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 ; IND-NEXT: store i16 [[TMP8]], ptr [[TMP6]], align 2 -; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2544,7 +2544,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> +; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 +; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 ; UNROLL-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 +; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: [[DOTSPLIT:%.*]] 
= getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2
; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]]
@@ -2553,13 +2557,9 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP25]], i64 10
; UNROLL-NEXT: [[TMP26:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP26]], i64 14
-; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0
; UNROLL-NEXT: store i16 [[TMP14]], ptr [[TMP10]], align 2
-; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1
; UNROLL-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2
-; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
; UNROLL-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2
-; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
; UNROLL-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
@@ -2610,18 +2610,18 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16>
+; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1
-; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0
; UNROLL-NO-IC-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2
-; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1
; UNROLL-NO-IC-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2
-; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0
; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -2666,7 +2666,15 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
; INTERLEAVE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
; INTERLEAVE-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16>
+; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0
+; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1
+; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2
+; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3
; INTERLEAVE-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16>
+; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0
+; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1
+; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2
+; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3
; INTERLEAVE-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2
; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]]
@@ -2683,21 +2691,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP40]], i64 26
; INTERLEAVE-NEXT: [[TMP41:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]]
; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP41]], i64 30
-; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0
; INTERLEAVE-NEXT: store i16 [[TMP22]], ptr [[TMP14]], align 2
-; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1
; INTERLEAVE-NEXT: store i16 [[TMP23]], ptr [[TMP15]], align 2
-; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2
; INTERLEAVE-NEXT: store i16 [[TMP24]], ptr [[TMP16]], align 2
-; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3
; INTERLEAVE-NEXT: store i16 [[TMP25]], ptr [[TMP17]], align 2
-; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0
; INTERLEAVE-NEXT: store i16 [[TMP26]], ptr [[TMP18]], align 2
-; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1
; INTERLEAVE-NEXT: store i16 [[TMP27]], ptr [[TMP19]], align 2
-; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2
; INTERLEAVE-NEXT: store i16 [[TMP28]], ptr [[TMP20]], align 2
-; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3
; INTERLEAVE-NEXT: store i16 [[TMP29]], ptr [[TMP21]], align 2
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8)
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index 0ebb652ef9648..16a56f3a38ac9 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -149,8 +149,8 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) {
; CHECK: pred.store.continue2:
; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0
-; CHECK-NEXT: store i64 [[TMP11]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2
+; CHECK-NEXT: store i64 [[TMP11]], ptr [[TMP2]], align 8
; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index b4cad1142134c..16357b3fbc4ab 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -557,6 +557,10 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]]
@@ -578,21 +582,17 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
; CHECK-NEXT: store i64 [[TMP21]], ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
; CHECK-NEXT: store i64 [[TMP23]], ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
; CHECK-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
; CHECK-NEXT: store i64 [[TMP25]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
; CHECK-NEXT: store i64 [[TMP26]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
@@ -888,12 +888,12 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -971,12 +971,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) {
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP20]], i64 28
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4
; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32>
@@ -1073,12 +1073,12 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) {
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1150,29 +1150,29 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
; CHECK-NEXT: [[DOTSPLIT3:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT3]], i64 4
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
; CHECK-NEXT: [[DOTSPLIT4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT4]], i64 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
; CHECK-NEXT: [[DOTSPLIT5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT5]], i64 4
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
; CHECK-NEXT: [[DOTSPLIT6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT6]], i64 4
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4
; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32>
@@ -1347,7 +1347,15 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3)
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP12]], i64 12
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
@@ -1356,21 +1364,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP34]], i64 28
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP35]], i64 36
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP15]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP17]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 9358fd9cc8440..00256a5c4a456 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -480,8 +480,8 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2
; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2
@@ -551,8 +551,8 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3)
; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
index ffeb3b19d11c4..f8ddd344f5587 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -192,8 +192,8 @@ define void @no_gep_or_bitcast(ptr noalias %a, i64 %n) {
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 0
-; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 8
; CHECK-NEXT: store i32 0, ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll
index 3c59a279e077d..fe25d1b231efc 100644
--- a/llvm/test/Transforms/LoopVectorize/metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata.ll
@@ -447,8 +447,8 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) {
; INTERLEAVE-NEXT: [[STEP_ADD3:%.*]] = add <2 x i32> [[VEC_IND1]], splat (i32 2)
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]], !custom_md [[META2:![0-9]+]]
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[VEC_IND]]
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[STEP_ADD]]
; INTERLEAVE-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[STEP_ADD]]
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
; INTERLEAVE-NEXT: store <2 x i32> [[VEC_IND1]], ptr [[TMP3]], align 4
; INTERLEAVE-NEXT: store <2 x i32> [[STEP_ADD3]], ptr [[TMP5]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 1319d068145a8..bfc7feecafbc4 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -802,17 +802,17 @@ define void @multiple_ivs_wide(ptr %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 6
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8)
@@ -838,17 +838,17 @@ define void @multiple_ivs_wide(ptr %dst) {
; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[VEC_IND2]], splat (i32 2)
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP17]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP18]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP21]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
; CHECK-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 8)
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 9f82795e1f71c..763072ab16f73 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -262,8 +262,8 @@ define void @pr43371() optsize {
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1
; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1
@@ -288,8 +288,8 @@ define void @pr43371() optsize {
; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
@@ -314,8 +314,8 @@ define void @pr43371() optsize {
; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
@@ -363,8 +363,8 @@ define void @pr43371_pgso() !prof !14 {
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1
; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1
@@ -389,8 +389,8 @@ define void @pr43371_pgso() !prof !14 {
; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
index bf23485ebdf16..cebf90ae9f8ce 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
@@ -17,11 +17,14 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP2]]
@@ -29,11 +32,8 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP4]]
; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 8
; CHECK-NEXT: store ptr [[TMP5]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1
; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
; CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
; CHECK-NEXT: store ptr [[TMP14]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 32
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 5c04e4caaa830..5c62ca3ff3d01 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -147,11 +147,11 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[VECTOR_GEP]], i64 1
; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[NEXT_GEP]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[WIDE_LOAD]], splat (i8 1)
; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP4]], align 1
@@ -551,12 +551,12 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) {
; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]]
; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8
; STRIDED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 0
-; STRIDED-NEXT: store i64 [[TMP23]], ptr [[NEXT_GEP]], align 8
; STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 1
-; STRIDED-NEXT: store i64 [[TMP24]], ptr [[NEXT_GEP1]], align 8
; STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 2
-; STRIDED-NEXT: store i64 [[TMP16]], ptr [[NEXT_GEP2]], align 8
; STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 3
+; STRIDED-NEXT: store i64 [[TMP23]], ptr [[NEXT_GEP]], align 8
+; STRIDED-NEXT: store i64 [[TMP24]], ptr [[NEXT_GEP1]], align 8
+; STRIDED-NEXT: store i64 [[TMP16]], ptr [[NEXT_GEP2]], align 8
; STRIDED-NEXT: store i64 [[TMP25]], ptr [[NEXT_GEP3]], align 8
; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; STRIDED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr34681.ll b/llvm/test/Transforms/LoopVectorize/pr34681.ll
index e1c1e2065498c..0f509a5c4eeb3 100644
--- a/llvm/test/Transforms/LoopVectorize/pr34681.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr34681.ll
@@ -62,12 +62,12 @@ define i32 @foo1(i32 %N, ptr nocapture readnone %A, ptr nocapture readonly %B, i
; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP13]], align 2
; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[TMP15]], align 2
@@ -167,12 +167,12 @@ define i32 @foo2(i16 zeroext %N, ptr nocapture readnone %A, ptr nocapture readon
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP12]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
index 32762a4e46a41..1bb6454cdeea2 100644
--- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -54,12 +54,12 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP8]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
index 6542c42678cc5..cf973affae5f2 100644
--- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
@@ -131,8 +131,8 @@ define void @widen_ptr_induction_dbg(ptr %start, ptr %end) {
; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEBUGLOC-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG35:![0-9]+]]
; DEBUGLOC-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> , !dbg [[DBG35]]
-; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0, !dbg [[DBG36:![0-9]+]]
-; DEBUGLOC-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 1, !dbg [[DBG36]]
+; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
+; DEBUGLOC-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 1, !dbg [[DBG36:![0-9]+]]
; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEBUGLOC-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 32, !dbg [[DBG35]]
; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG37:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 7dd29958504eb..c708715c623e6 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -348,16 +348,16 @@ define void @reduc_store_inside_unrolled(ptr %dst, ptr readonly %src) {
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4, !alias.scope [[META16]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META16]]
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META16]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP37]], align 4, !alias.scope [[META16]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META16]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP38]], align 4, !alias.scope [[META16]]
; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4, !alias.scope [[META16]]
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
@@ -554,16 +554,16 @@ define void @reduc_store_middle_store_predicated(ptr %dst, ptr readonly %src) {
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4, !alias.scope [[META23]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META23]]
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META23]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP37]], align 4, !alias.scope [[META23]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META23]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP38]], align 4, !alias.scope [[META23]]
; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4, !alias.scope [[META23]]
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index 5894c3af1d637..c270a23344f54 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -41,8 +41,8 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x ptr> [[WIDE_LOAD]], <2 x ptr> poison, <2 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[REVERSE]], i32 0
-; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[REVERSE]], i32 1
+; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8
; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index ad8cd425cd6c2..296fe017c4d42 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -22,10 +22,10 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = extractelement [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[WIDE_LOAD1]], i32 0
+; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02
; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+02
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]])
diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
index 58542f4f27618..163faa2254700 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
@@ -29,16 +29,16 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) {
; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP11]], align 8
-; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP14]], align 8
; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP15]], align 8
+; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP16]], align 8
; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP17]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index 3bb39b95235ed..cde2de73b7bfd 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -142,8 +142,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
index 99916a503750a..8123092df1ccc 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -14,12 +14,12 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
-; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]]
; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
-; VF4-NEXT: [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]]
; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
-; VF4-NEXT: [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]]
; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]]
+; VF4-NEXT: [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]]
+; VF4-NEXT: [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]]
; VF4-NEXT: [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]]
; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0
; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
@@ -55,11 +55,13 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
+; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
-; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
-; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]]
-; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
-; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]]
+; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0:[0-9]+]]
+; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0]]
; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0
; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i64 0
; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0
@@ -67,10 +69,8 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0
; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i64 1
; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0
-; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
-; VF2IC2-NEXT: [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]]
-; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
-; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]]
+; VF2IC2-NEXT: [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]]
+; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]]
; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0
; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i64 0
; VF2IC2-NEXT: [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0
@@ -120,12 +120,12 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
-; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]]
; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
-; VF4-NEXT: [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]]
; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
-; VF4-NEXT: [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]]
; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; VF4-NEXT: [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]]
+; VF4-NEXT: [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]]
; VF4-NEXT: [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]]
; VF4-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0
; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0
@@ -180,11 +180,13 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
+; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
-; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
-; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]]
-; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
-; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]]
+; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1:[0-9]+]]
+; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1]]
; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0
; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0
; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0
@@ -200,10 +202,8 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1
; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1
; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1
-; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
-; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]]
-; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
-; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]]
+; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]]
+; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]]
; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0
; VF2IC2-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0
; VF2IC2-NEXT: [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0
@@ -271,12 +271,12 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
-; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
-; VF4-NEXT: [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]]
; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
-; VF4-NEXT: [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]]
; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
+; VF4-NEXT: [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]]
+; VF4-NEXT: [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]]
; VF4-NEXT: [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]]
; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0
; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
@@ -350,11 +350,13 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
-; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]]
+; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2:[0-9]+]]
+; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2]]
; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0
; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0
@@ -378,10 +380,8 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2
; VF2IC2-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i64 1
; VF2IC2-NEXT: [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2
-; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0
-; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]]
-; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1
-; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]]
+; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]]
+; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]]
; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0
; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i64 0
; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
index 63ca45495335f..abdd5e9562590 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
@@ -76,10 +76,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1
-; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[TMP0]]) #[[ATTR0]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]]
+; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP3]], i64 [[TMP0]]) #[[ATTR0]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i64 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP8]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 985a9a2c2d155..358f1b0d108c2 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -104,12 +104,12 @@ define void @blend_chain_iv(i1 %c) {
; CHECK-NEXT: [[PREDPHI1:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> undef
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]]
; CHECK-NEXT: store i16 0, ptr [[TMP2]], align 2
; CHECK-NEXT: store i16 0, ptr [[TMP4]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 571c55c276dd5..927fefc73ceea 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -101,10 +101,10 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -153,20 +153,20 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
@@ -210,12 +210,12 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -257,20 +257,20 @@ define void
@ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -313,20 +313,20 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] 
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -368,20 +368,20 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -424,12 +424,12 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8 +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -509,10 +509,10 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -558,10 +558,10 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -610,20 +610,20 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], 
i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -667,12 +667,12 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -714,20 +714,20 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> 
[[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -770,20 +770,20 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 
[[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -826,20 +826,20 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -883,12 +883,12 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = 
extractelement <2 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] @@ -931,10 +931,10 @@ define void @test_step_is_not_invariant(ptr %A) { ; CHECK-NEXT: [[TMP5:%.*]] = udiv <2 x i16> [[TMP4]], splat (i16 6) ; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store i16 [[TMP2]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll index 6cf82fc2c9d48..d6277d657ea7e 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll @@ -101,10 +101,10 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -153,20 +153,20 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1) ; 
CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -255,20 +255,20 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x 
i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -310,20 +310,20 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -363,10 +363,10 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -415,20 +415,20 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -471,20 +471,20 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, 
ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -527,20 +527,20 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll index 9ed22400b7055..bc8184ab34682 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll @@ -18,28 +18,28 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = urem <8 x i64> [[TMP0]], splat (i64 3) ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> poison, i64 [[TMP18]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 1 @@ -94,28 +94,28 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = udiv <8 x i64> [[TMP0]], splat (i64 2) ; CHECK-NEXT: [[TMP2:%.*]] = urem <8 x i64> [[TMP1]], splat (i64 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP14]], align 8 -; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> poison, i64 [[TMP19]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 1 @@ -168,28 +168,28 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, 
ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> poison, i64 [[TMP17]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll index 2b5d0f3cb0125..32873a4e90e81 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll @@ -105,16 +105,16 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF4-NEXT: [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8 -; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0 ; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1 @@ -233,20 +233,20 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; 
VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -272,34 +272,34 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -343,12 +343,12 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] ; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; VF2-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8 -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8 +; VF2-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -374,18 +374,18 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; VF4-NEXT: [[TMP6:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: store i64 [[TMP11]], ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP7]], ptr [[TMP11]], align 8 +; VF4-NEXT: store i64 [[TMP8]], ptr [[TMP12]], align 8 +; VF4-NEXT: store i64 [[TMP9]], ptr [[TMP13]], align 8 +; VF4-NEXT: store i64 [[TMP10]], ptr [[TMP14]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -427,20 +427,20 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> 
poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -465,34 +465,34 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; 
VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -534,20 +534,20 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr 
[[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -572,34 +572,34 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; 
VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -640,10 +640,10 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; VF2-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -670,16 +670,16 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; VF4-NEXT: [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8 -; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: 
[[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0 ; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1 @@ -731,12 +731,12 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; VF2-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -762,18 +762,18 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8 ; VF4-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP8]], ptr [[TMP12]], align 8 +; VF4-NEXT: store i64 [[TMP9]], ptr [[TMP13]], align 8 +; VF4-NEXT: store i64 [[TMP10]], ptr [[TMP14]], align 8 +; 
VF4-NEXT: store i64 [[TMP11]], ptr [[TMP15]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 ; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -815,20 +815,20 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; VF2-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -854,34 +854,34 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; VF4-NEXT: 
[[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3 ; VF4-NEXT: [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42) -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -924,20 +924,20 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: 
[[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; VF2-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -963,34 +963,34 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1 ; VF4-NEXT: 
[[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3 ; VF4-NEXT: [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42) -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll index 12851d7d91cc7..607d1365098f2 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll @@ -18,10 +18,10 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -52,16 +52,16 @@ define void 
@ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -143,16 +143,16 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], 
align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -208,10 +208,10 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -242,16 +242,16 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -310,20 +310,20 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; 
VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -353,34 +353,34 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load 
i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -431,20 +431,20 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> 
[[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -474,34 +474,34 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds 
i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -552,20 +552,20 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; 
VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -595,34 +595,34 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 
2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -673,20 +673,20 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -715,34 +715,34 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; 
VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -792,20 
+792,20 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -834,34 +834,34 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -911,20 +911,20 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load 
i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -953,34 +953,34 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 
42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1028,10 +1028,10 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1062,16 +1062,16 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> 
[[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1127,10 +1127,10 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1161,16 +1161,16 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: 
[[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1226,10 +1226,10 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1260,16 +1260,16 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; 
VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1328,20 +1328,20 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1371,34 +1371,34 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr 
[[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1449,20 +1449,20 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1492,34 +1492,34 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: 
[[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1570,20 +1570,20 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1613,34 +1613,34 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 
[[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1691,20 +1691,20 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 
x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1734,34 +1734,34 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1812,20 +1812,20 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1855,34 +1855,34 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1933,20 +1933,20 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1976,34 +1976,34 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 027dcaf771072..6a6ae316f4e52 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -330,12 +330,12 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP17]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[TMP14]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
index 9ace6be64b69a..e5e02674704f9 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
@@ -61,29 +61,29 @@ define void @expand(ptr %src, ptr %dst, i64 %0) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]]
-; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP22]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP24]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP31]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP26]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP33]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP28]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = or disjoint <4 x i64> [[TMP20]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP29]], i32 1
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i64> [[TMP29]], i32 2
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i64> [[TMP29]], i32 3
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP36]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP31]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP33]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP41]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP35]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP42]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP37]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 52d279a8aafa6..e3765ed541e7a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -114,14 +114,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP6]], i64 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <2 x i64> [[TMP8]], splat (i64 225)
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP14]])
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP15]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP12]], i64 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP14]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP15]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP16]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 16
@@ -190,14 +190,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[TMP37]], i64 0
 ; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[TMP38]], i64 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ult <2 x i64> [[TMP36]], splat (i64 225)
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i1> [[TMP41]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP43]])
 ; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i1> [[TMP41]], i64 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP44]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i1> [[TMP42]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP45]])
 ; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x i1> [[TMP42]], i64 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP43]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP44]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP45]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP46]])
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP33]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP47]], i64 16
@@ -267,14 +267,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <2 x i64> poison, i64 [[TMP68]], i64 0
 ; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i64> [[TMP70]], i64 [[TMP69]], i64 1
 ; CHECK-NEXT:    [[TMP72:%.*]] = icmp ult <2 x i64> [[TMP67]], splat (i64 225)
-; CHECK-NEXT:    [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i1> [[TMP72]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP74]])
 ; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i1> [[TMP72]], i64 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP75]])
+; CHECK-NEXT:    [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <2 x i1> [[TMP73]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP76]])
 ; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i1> [[TMP73]], i64 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP74]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP75]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP76]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP77]])
 ; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP64]]
 ; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP78]], i64 16
@@ -344,14 +344,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <2 x i64> poison, i64 [[TMP99]], i64 0
 ; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <2 x i64> [[TMP101]], i64 [[TMP100]], i64 1
 ; CHECK-NEXT:    [[TMP103:%.*]] = icmp ult <2 x i64> [[TMP98]], splat (i64 225)
-; CHECK-NEXT:    [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP105:%.*]] = extractelement <2 x i1> [[TMP103]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP105]])
 ; CHECK-NEXT:    [[TMP106:%.*]] = extractelement <2 x i1> [[TMP103]], i64 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP106]])
+; CHECK-NEXT:    [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225)
 ; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i1> [[TMP104]], i64 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP107]])
 ; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <2 x i1> [[TMP104]], i64 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP105]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP106]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP107]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP108]])
 ; CHECK-NEXT:    [[TMP109:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP95]]
 ; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP109]], i64 16