diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f4259d3d69880..d5c60786d7f9b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7328,6 +7328,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); + VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a3f39f5ad7a29..2521b2b93e6c3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -261,6 +261,13 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { return Data.VPV2Scalars[Def][0]; } + // Look through BuildVector to avoid redundant extracts. + // TODO: Remove once replicate regions are unrolled explicitly. + if (Lane.getKind() == VPLane::Kind::First && match(Def, m_BuildVector())) { + auto *BuildVector = cast(Def); + return get(BuildVector->getOperand(Lane.getKnownLane()), true); + } + assert(hasVectorValue(Def)); auto *VecPart = Data.VPV2Vector[Def]; if (!VecPart->getType()->isVectorTy()) { @@ -360,17 +367,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { // resulting vectors are stored in State, we will only generate the // insertelements once. Value *VectorValue = nullptr; - if (IsSingleScalar) { - VectorValue = GetBroadcastInstrs(ScalarValue); - set(Def, VectorValue); - } else { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - // Initialize packing with insertelements to start from poison. - VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); - for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane) - VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane); - set(Def, VectorValue); - } + assert(IsSingleScalar && "replicates must be packed explicitly"); + VectorValue = GetBroadcastInstrs(ScalarValue); + set(Def, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index f4163b0743a9a..9ffdd481a784c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -936,6 +936,13 @@ class VPInstruction : public VPRecipeWithIRFlags, BranchOnCount, BranchOnCond, Broadcast, + /// Given operands of (the same) struct type, creates a struct of fixed- + /// width vectors each containing a struct field of all operands. The + /// number of operands matches the element count of every vector. + BuildStructVector, + /// Creates a fixed-width vector containing all operands. The number of + /// operands matches the vector element count. + BuildVector, ComputeAnyOfResult, ComputeFindLastIVResult, ComputeReductionResult, @@ -970,6 +977,9 @@ class VPInstruction : public VPRecipeWithIRFlags, // Creates a step vector starting from 0 to VF with a step of 1. StepVector, + Pack, + Unpack, + }; private: @@ -979,14 +989,6 @@ class VPInstruction : public VPRecipeWithIRFlags, /// An optional name that can be used for the generated IR instruction. 
const std::string Name; - /// Returns true if this VPInstruction generates scalar values for all lanes. - /// Most VPInstructions generate a single value per part, either vector or - /// scalar. VPReplicateRecipe takes care of generating multiple (scalar) - /// values per all lanes, stemming from an original ingredient. This method - /// identifies the (rare) cases of VPInstructions that do so as well, w/o an - /// underlying ingredient. - bool doesGeneratePerAllLanes() const; - /// Returns true if we can generate a scalar for the first lane only if /// needed. bool canGenerateScalarForFirstLane() const; @@ -1080,6 +1082,14 @@ class VPInstruction : public VPRecipeWithIRFlags, /// result is also a single scalar. bool isSingleScalar() const; + /// Returns true if this VPInstruction generates scalar values for all lanes. + /// Most VPInstructions generate a single value per part, either vector or + /// scalar. VPReplicateRecipe takes care of generating multiple (scalar) + /// values per all lanes, stemming from an original ingredient. This method + /// identifies the (rare) cases of VPInstructions that do so as well, w/o an + /// underlying ingredient. + bool doesGeneratePerAllLanes() const; + /// Returns the symbolic name assigned to the VPInstruction. StringRef getName() const { return Name; } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 76da5b0314a8e..9fda3406604b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -100,6 +100,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::ExplicitVectorLength: return Type::getIntNTy(Ctx, 32); case Instruction::PHI: + case VPInstruction::Pack: + case VPInstruction::Unpack: // Infer the type of first operand only, as other operands of header phi's // may lead to infinite recursion. return inferScalarType(R->getOperand(0)); @@ -108,6 +110,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::AnyOf: + case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: return SetResultTyFromOp(); case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); @@ -440,6 +444,10 @@ SmallVector llvm::calculateRegisterUsageForPlan( if (!VPBB->getParent()) break; for (VPRecipeBase &R : *VPBB) { + if (isa(&R) && + (cast(&R)->getOpcode() == VPInstruction::Pack || + cast(&R)->getOpcode() == VPInstruction::Unpack)) + continue; Idx2Recipe.push_back(&R); // Save the end location of each USE. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index ae91e310f7592..07caf2b7c5514 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -224,6 +224,9 @@ struct Recipe_match { if ((!matchRecipeAndOpcode(R) && ...)) return false; + auto *VPI = dyn_cast(R); + if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) + return true; assert(R->getNumOperands() == std::tuple_size::value && "recipe with matched opcode does not have the expected number of " "operands"); @@ -263,6 +266,10 @@ struct Recipe_match { } }; +template +using ZeroOpRecipe_match = + Recipe_match, Opcode, false, RecipeTys...>; + template using UnaryRecipe_match = Recipe_match, Opcode, false, RecipeTys...>; @@ -271,6 +278,9 @@ template using UnaryVPInstruction_match = UnaryRecipe_match; +template +using ZeroOpVPInstruction_match = ZeroOpRecipe_match; + template using AllUnaryRecipe_match = UnaryRecipe_match; +inline ZeroOpVPInstruction_match m_BuildVector() { + return ZeroOpVPInstruction_match(); +} + template inline UnaryVPInstruction_match m_VPInstruction(const Op0_t &Op0) { @@ -364,6 +378,12 @@ m_Broadcast(const Op0_t &Op0) { return m_VPInstruction(Op0); } +template +inline UnaryVPInstruction_match +m_ExtractLastElement(const Op0_t &Op0) { + return m_VPInstruction(Op0); +} + template inline BinaryVPInstruction_match m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3e12fdf9163eb..c0b284208e21a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -418,7 +418,8 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef Operands, } bool VPInstruction::doesGeneratePerAllLanes() const { - return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); + return (Opcode == VPInstruction::PtrAdd || Opcode == VPInstruction::Unpack) && + !vputils::onlyFirstLaneUsed(this); } bool VPInstruction::canGenerateScalarForFirstLane() const { @@ -438,6 +439,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: case VPInstruction::AnyOf: + case VPInstruction::Unpack: return true; default: return false; @@ -448,10 +450,17 @@ Value *VPInstruction::generatePerLane(VPTransformState &State, const VPLane &Lane) { IRBuilderBase &Builder = State.Builder; - assert(getOpcode() == VPInstruction::PtrAdd && - "only PtrAdd opcodes are supported for now"); - return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), - State.get(getOperand(1), Lane), Name); + switch (getOpcode()) { + case VPInstruction::PtrAdd: + return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), + State.get(getOperand(1), Lane), Name); + + case VPInstruction::Unpack: { + Value *LaneV = Lane.getAsRuntimeExpr(State.Builder, State.VF); + return Builder.CreateExtractElement(State.get(getOperand(0)), LaneV); + } + } + llvm_unreachable("all supported opcodes must be handled above"); } /// Create a conditional branch using \p Cond branching to the successors of \p @@ -495,9 +504,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::ExtractElement: { assert(State.VF.isVector() && "Only extract elements from vectors"); - Value *Vec = State.get(getOperand(0)); - Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); - return 
Builder.CreateExtractElement(Vec, Idx, Name); + unsigned IdxToExtract = + cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue(); + return State.get(getOperand(0), VPLane(IdxToExtract)); } case Instruction::Freeze: { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); @@ -608,6 +617,32 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreateVectorSplat( State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); } + case VPInstruction::BuildStructVector: { + // For struct types, we need to build a new 'wide' struct type, where each + // element is widened, i.e. we create a struct of vectors. + auto *StructTy = + cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0))); + Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); + for (const auto &[Idx, Op] : enumerate(operands())) { + for (unsigned I = 0; I != StructTy->getNumElements(); I++) { + Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I); + Value *VectorValue = Builder.CreateExtractValue(Res, I); + VectorValue = + Builder.CreateInsertElement(VectorValue, ScalarValue, Idx); + Res = Builder.CreateInsertValue(Res, VectorValue, I); + } + } + return Res; + } + case VPInstruction::BuildVector: { + auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); + auto NumOfElements = ElementCount::getFixed(getNumOperands()); + Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); + for (const auto &[Idx, Op] : enumerate(operands())) + Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), + State.Builder.getInt32(Idx)); + return Res; + } case VPInstruction::ReductionStartVector: { if (State.VF.isScalar()) return State.get(getOperand(0), true); @@ -766,6 +801,20 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, true, Name); } + case VPInstruction::Pack: { + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + Value *WideValue = PoisonValue::get( + toVectorizedTy(State.TypeAnalysis.inferScalarType(this), State.VF)); + for (unsigned Lane = 0; Lane < State.VF.getFixedValue(); ++Lane) + WideValue = + State.packScalarIntoVectorizedValue(getOperand(0), WideValue, Lane); + return WideValue; + } + case VPInstruction::Unpack: { + assert(vputils::onlyFirstLaneUsed(this) && + "can only generate first lane for Unpack"); + return generatePerLane(State, VPLane(0)); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -894,10 +943,11 @@ void VPInstruction::execute(VPTransformState &State) { if (!hasResult()) return; assert(GeneratedValue && "generate must produce a value"); - assert( - (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly || - State.VF.isScalar()) && - "scalar value but not only first lane defined"); + assert(((GeneratedValue->getType()->isVectorTy() || + GeneratedValue->getType()->isStructTy()) == + !GeneratesPerFirstLaneOnly || + State.VF.isScalar()) && + "scalar value but not only first lane defined"); + State.set(this, GeneratedValue, /*IsScalar*/ GeneratesPerFirstLaneOnly); } @@ -911,6 +961,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: + case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractLastElement: @@ -923,6 +975,8 @@ bool
VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::WideIVStep: case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: + case VPInstruction::Pack: + case VPInstruction::Unpack: return false; default: return true; @@ -1033,6 +1087,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::Broadcast: O << "broadcast"; break; + case VPInstruction::BuildStructVector: + O << "buildstructvector"; + break; + case VPInstruction::BuildVector: + O << "buildvector"; + break; case VPInstruction::ExtractLastElement: O << "extract-last-element"; break; @@ -1063,6 +1123,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ReductionStartVector: O << "reduction-start-vector"; break; + case VPInstruction::Pack: + O << "pack-into-vector"; + break; + case VPInstruction::Unpack: + O << "unpack-into-scalars"; + break; + default: O << Instruction::getOpcodeName(getOpcode()); } @@ -2627,45 +2694,30 @@ static void scalarizeInstruction(const Instruction *Instr, void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); - if (State.Lane) { // Generate a single instance. - assert((State.VF.isScalar() || !isSingleScalar()) && - "uniform recipe shouldn't be predicated"); - assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - scalarizeInstruction(UI, this, *State.Lane, State); - // Insert scalar instance packing it into a vector. - if (State.VF.isVector() && shouldPack()) { - Value *WideValue; - // If we're constructing lane 0, initialize to start from poison. - if (State.Lane->isFirstLane()) { - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); - WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); - } else { - WideValue = State.get(this); - } - State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, - *State.Lane)); - } - return; - } - if (IsSingleScalar) { - // Uniform within VL means we need to generate lane 0. + if (!State.Lane) { + assert(IsSingleScalar && + "VPReplicateRecipes outside replicate regions must be unrolled"); scalarizeInstruction(UI, this, VPLane(0), State); return; } - // A store of a loop varying value to a uniform address only needs the last - // copy of the store. - if (isa(UI) && vputils::isSingleScalar(getOperand(1))) { - auto Lane = VPLane::getLastLaneForVF(State.VF); - scalarizeInstruction(UI, this, VPLane(Lane), State); - return; + assert((State.VF.isScalar() || !isSingleScalar()) && + "uniform recipe shouldn't be predicated"); + scalarizeInstruction(UI, this, *State.Lane, State); + // Insert scalar instance packing it into a vector. + if (State.VF.isVector() && shouldPack()) { + Value *WideValue; + // If we're constructing lane 0, initialize to start from poison. + if (State.Lane->isFirstLane()) { + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); + } else { + WideValue = State.get(this); + } + State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, + *State.Lane)); } - - // Generate scalar instances for all VF lanes. 
- const unsigned EndLane = State.VF.getFixedValue(); - for (unsigned Lane = 0; Lane < EndLane; ++Lane) - scalarizeInstruction(UI, this, VPLane(Lane), State); } bool VPReplicateRecipe::shouldPack() const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ac6be09ef271d..c1c21e1397ad1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -683,7 +683,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // scalars. auto *WideIV = cast(&Phi); if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) { - return U->usesScalars(WideIV); + return U->usesScalars(WideIV) || + match(U, m_ExtractLastElement(m_VPValue())); })) continue; @@ -694,12 +695,19 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(), WideIV->getDebugLoc(), Builder); + bool HasWideOps = any_of(WideIV->users(), [WideIV](VPUser *U) { + return !U->usesScalars(WideIV) && + !match(U, m_ExtractLastElement(m_VPValue())); + }); + // Update scalar users of IV to use Step instead. if (!HasOnlyVectorVFs) WideIV->replaceAllUsesWith(Steps); else - WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) { - return U.usesScalars(WideIV); + WideIV->replaceUsesWithIf(Steps, [WideIV, HasWideOps](VPUser &U, + unsigned) { + return U.usesScalars(WideIV) || + (!HasWideOps && match(&U, m_ExtractLastElement(m_VPValue()))); }); } } @@ -1140,6 +1148,24 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + // Look through ExtractLastElement (BuildVector ....). + if (match(&R, m_VPInstruction( + m_BuildVector()))) { + auto *BuildVector = cast(R.getOperand(0)); + Def->replaceAllUsesWith( + BuildVector->getOperand(BuildVector->getNumOperands() - 1)); + return; + } + + // Look through ExtractPenultimateElement (BuildVector ....). + if (match(&R, m_VPInstruction( + m_BuildVector()))) { + auto *BuildVector = cast(R.getOperand(0)); + Def->replaceAllUsesWith( + BuildVector->getOperand(BuildVector->getNumOperands() - 2)); + return; + } + // Some simplifications can only be applied after unrolling. Perform them // below. if (!Plan->isUnrolled()) @@ -1209,6 +1235,20 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { continue; auto *RepOrWidenR = cast(&R); + + if (RepR && isa(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + auto *Clone = new VPReplicateRecipe( + RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), + true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); + Clone->insertBefore(RepOrWidenR); + auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement, + {Clone->getOperand(0)}); + Ext->insertBefore(Clone); + Clone->setOperand(0, Ext); + RepR->eraseFromParent(); + continue; + } // Skip recipes that aren't single scalars or don't have only their // scalar results used. In the latter case, we would introduce extra // broadcasts. 
@@ -1895,6 +1935,62 @@ static void removeBranchOnConst(VPlan &Plan) { } } +static void materializePack(VPlan &Plan) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *RepR = dyn_cast(&R); + if (!(RepR && !RepR->isSingleScalar()) && + !(isa(&R) && + cast(&R)->doesGeneratePerAllLanes())) + continue; + auto *Def = cast(&R); + for (auto *Op : to_vector(Def->operands())) { + VPRecipeBase *OpDef = Op->getDefiningRecipe(); + if (!OpDef || isa(OpDef) || + vputils::isSingleScalar(Op) || + (isa(OpDef) && + cast(OpDef)->doesGeneratePerAllLanes()) || + isa(OpDef)) + continue; + + auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Op}); + if (OpDef->isPhi()) + Unpack->insertBefore(*OpDef->getParent(), + OpDef->getParent()->getFirstNonPhi()); + else + Unpack->insertAfter(OpDef); + Op->replaceUsesWithIf(Unpack, [](VPUser &U, unsigned) { + auto *RepR = dyn_cast(&U); + return RepR && (!isa(RepR->getUnderlyingInstr()) || + !vputils::isSingleScalar(RepR->getOperand(1))); + }); + } + if (all_of(Def->users(), [Def](VPUser *U) { + auto *R = cast(U); + return U->usesScalars(Def) && + (!R->getParent()->getParent() || + !R->getParent()->getParent()->getParent()); + })) + continue; + + auto *Pack = new VPInstruction(VPInstruction::Pack, {Def}); + Pack->insertAfter(Def); + Def->replaceUsesWithIf(Pack, [Pack, Def](VPUser &U, unsigned) { + auto *R = cast(&U); + return &U != Pack && + (!U.usesScalars(Def) || + (R->getParent()->getParent() && R->getParent()->getParent())) && + (!isa(&U) || + (cast(&U)->getOpcode() != + VPInstruction::ExtractLastElement && + cast(&U)->getOpcode() != + VPInstruction::ExtractPenultimateElement)); + }); + } + } +} + void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantCanonicalIVs, Plan); runPass(removeRedundantInductionCasts, Plan); @@ -1912,6 +2008,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(createAndOptimizeReplicateRegions, Plan); runPass(mergeBlocksIntoPredecessors, Plan); runPass(licm, Plan); + runPass(materializePack, Plan); } // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 7e51c05d1b5b5..40885cd52a127 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,6 +99,12 @@ struct VPlanTransforms { /// Explicitly unroll \p Plan by \p UF. static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); + /// Replace each VPReplicateRecipe outside of any replicate region in \p Plan + /// with \p VF single-scalar recipes. + /// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby + /// dissolving the latter. + static void replicateByVF(VPlan &Plan, ElementCount VF); + /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the /// resulting plan to \p BestVF and \p BestUF. 
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 0bc683e557e70..cea56765924c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -15,6 +15,7 @@ #include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanCFG.h" +#include "VPlanHelpers.h" #include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" @@ -450,3 +451,127 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { VPlanTransforms::removeDeadRecipes(Plan); } + +struct VPReplicateUnroller { + VPlan &Plan; + Type *IdxTy; + DenseMap> Rep2LaneDefs; + + VPReplicateUnroller(VPlan &Plan, Type *IdxTy) : Plan(Plan), IdxTy(IdxTy) {} + + void addLaneDef(VPValue *Def, VPValue *LaneDef) { + const auto &[LaneDefs, _] = Rep2LaneDefs.insert({Def, {}}); + LaneDefs->second.push_back(LaneDef); + } + + VPValue *getLane(VPValue *Op, const VPLane &Lane, VPBuilder &Builder) { + const auto &LaneDefs = Rep2LaneDefs.lookup(Op); + if (LaneDefs.empty()) { + if (Lane.getKind() == VPLane::Kind::ScalableLast) { + return Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}); + } + VPValue *Idx = + Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); + } + assert(Lane.getKind() != VPLane::Kind::ScalableLast); + return LaneDefs[Lane.getKnownLane()]; + } + + VPValue *getLane0(VPReplicateRecipe *RepR) { + VPBuilder B; + return getLane(RepR, VPLane::getFirstLane(), B); + } +}; + +/// Create a single-scalar clone of \p RepR for lane \p Lane. +static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, + Type *IdxTy, VPReplicateRecipe *RepR, + VPLane Lane, + VPReplicateUnroller &State) { + // Collect the operands at Lane, creating extracts as needed. + SmallVector NewOps; + for (VPValue *Op : RepR->operands()) { + if (vputils::isSingleScalar(Op)) { + NewOps.push_back(Op); + continue; + } + NewOps.push_back(State.getLane(Op, Lane, Builder)); + } + + auto *New = + new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, + /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); + New->insertBefore(RepR); + return New; +} + +void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { + Type *IdxTy = IntegerType::get( + Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32); + VPReplicateUnroller State(Plan, IdxTy); + SmallVector ToRemove; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *Pack = dyn_cast(&R); + if (Pack && Pack->getOpcode() == VPInstruction::Pack) { + auto LaneDefs = State.Rep2LaneDefs.lookup(Pack->getOperand(0)); + if (!LaneDefs.empty()) { + auto *RepR = cast(Pack->getOperand(0)); + // If needed, create a Build(Struct)Vector recipe to insert the scalar + // lane values into a vector. + Type *ResTy = RepR->getUnderlyingInstr()->getType(); + VPBuilder Builder(Pack); + VPValue *VecRes = Builder.createNaryOp( + ResTy->isStructTy() ? 
VPInstruction::BuildStructVector + : VPInstruction::BuildVector, + LaneDefs); + Pack->replaceAllUsesWith(VecRes); + Pack->eraseFromParent(); + } else { + assert(!isa(Pack->getOperand(0))); + } + continue; + } + if (Pack && Pack->getOpcode() == VPInstruction::Unpack) { + VPBuilder Builder(Pack); + + auto *Def = Pack->getOperand(0); + for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) { + VPValue *Idx = Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, I)); + State.addLaneDef(Pack, Builder.createNaryOp( + Instruction::ExtractElement, {Def, Idx})); + } + continue; + } + + auto *RepR = dyn_cast(&R); + if (!RepR || RepR->isSingleScalar()) + continue; + + VPBuilder Builder(RepR); + ToRemove.push_back(RepR); + // Stores to invariant addresses need to store the last lane only. + if (isa(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), + State); + continue; + } + + /// Create single-scalar version of RepR for all lanes. + for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) + State.addLaneDef( + RepR, cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), State)); + + /// Users that only demand the first lane can use the definition for lane + /// 0. + RepR->replaceUsesWithIf( + State.getLane0(RepR), + [RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); }); + } + } + for (auto *R : reverse(ToRemove)) + R->eraseFromParent(); +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index 83b35309638a7..79a2cdce4086b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -20,21 +20,21 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1 ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP2]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 
[[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 @@ -62,8 +62,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1 ; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i32 0 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index 078f98f54525b..1ab39f567e710 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -109,10 +109,10 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3:[0-9]+]] ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]] -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1 ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]] ; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]] +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1 ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0 ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1 ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 0fc324f720e60..b621ffdbae791 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -306,22 +306,6 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 -; 
CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 -; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 -; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 -; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 -; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 -; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 -; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 -; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 -; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 -; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 ; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2 @@ -338,6 +322,22 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 ; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 ; 
CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 ; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index a44b9bfb3b446..0b53894742722 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -655,22 +655,6 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 -; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 -; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 -; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 -; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 -; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 -; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 -; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 -; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 -; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 -; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 ; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2 @@ -687,6 +671,22 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = 
insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 ; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 ; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll index c721493243734..ea8fa88573ac4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll @@ -15,13 +15,17 @@ target triple = "aarch64--linux-gnu" ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> ; -; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> -; -; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> +; +; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> define void @struct_return_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_widen( @@ -64,17 +68,23 @@ exit: ; CHECK-COST: LV: Found an 
estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 ; -; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> ; -; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> ; -; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> define void @struct_return_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_replicate( @@ -118,29 +128,41 @@ exit: ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 ; -; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> -; -; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> -; -; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, 
ir<1> -; -; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> -; -; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> -; -; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> +; +; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> +; +; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> +; +; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> +; +; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> +; +; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-COST: Cost of 0 for VF 
vscale x 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> ; ; CHECK-COST: Cost of 10 for VF vscale x 8: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>, ir) (using library function: scalable_vec_masked_foo) ; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index 46b4762520ded..8527f46f740e4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -28,10 +28,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%load>) +; CHECK-NEXT: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%load> +; CHECK-NEXT: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-NEXT: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, vp<[[PACK]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll index 813d61b52100f..35441a38a6fee 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll @@ -768,15 +768,15 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) { ; VF2-NEXT: [[TMP22:%.*]] = shufflevector <6 x i32> [[WIDE_VEC1]], <6 x i32> poison, <2 x i32> ; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]] ; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0 -; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1 +; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8 ; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 ; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 ; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]] ; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0 -; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98 @@ -809,12 +809,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) { ; VF4-NEXT: [[TMP44:%.*]] = 
shufflevector <12 x i32> [[WIDE_VEC1]], <12 x i32> poison, <4 x i32> ; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]] ; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0 -; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8 ; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1 -; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8 ; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2 -; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8 ; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3 +; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8 +; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8 +; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8 ; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8 ; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 ; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 @@ -822,12 +822,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) { ; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1 ; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]] ; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0 -; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8 ; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1 -; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8 ; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2 -; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8 ; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3 +; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8 +; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8 +; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8 ; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll index c017c1f3b36ee..ae1f39982d565 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll @@ -21,30 +21,30 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[WIDE_LOAD]], splat (i16 10) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = ashr exact i64 [[TMP11]], 32 -; CHECK-NEXT: [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32 -; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32 -; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32 -; 
CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: store i16 [[TMP23]], ptr [[TMP19]], align 2 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 -; CHECK-NEXT: store i16 [[TMP24]], ptr [[TMP20]], align 2 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 -; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 -; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32 +; CHECK-NEXT: [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32 +; CHECK-NEXT: [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]] +; CHECK-NEXT: store i16 [[TMP7]], ptr [[TMP23]], align 2 +; CHECK-NEXT: store i16 [[TMP8]], ptr [[TMP24]], align 2 +; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP25]], align 2 +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP26]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -115,30 +115,30 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 -; 
CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32 -; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32 -; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32 -; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0 -; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1 -; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 -; CHECK-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 -; CHECK-NEXT: store i16 [[TMP28]], ptr [[TMP24]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32 +; CHECK-NEXT: [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32 +; CHECK-NEXT: [[TMP23:%.*]] = ashr exact i64 [[TMP19]], 32 +; CHECK-NEXT: [[TMP24:%.*]] = ashr exact i64 [[TMP20]], 32 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP24]] +; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP25]], align 2 +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP26]], align 2 +; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP27]], align 2 +; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP28]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index 1f2e91884a5d9..96fc4144f4a89 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ 
-28,8 +28,9 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double ; CHECK-NEXT: WIDEN-CALL ir<%s> = call fast @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) +; CHECK-NEXT: EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%s> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> +; CHECK-NEXT: REPLICATE store vp<[[SCALARS]]>, ir<%gep.dst> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -75,8 +76,9 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double ; CHECK-NEXT: WIDEN-INTRINSIC ir<%s> = call fast llvm.sin(ir<%conv>) +; CHECK-NEXT: EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%s> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> +; CHECK-NEXT: REPLICATE store vp<[[SCALARS]]>, ir<%gep.dst> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -113,11 +115,11 @@ define void @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> ; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @__simd_sin_v2f64(<2 x double> [[TMP4]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 ; CHECK-NEXT: store double [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 ; CHECK-NEXT: store double [[TMP9]], ptr [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index 3545c6b2239d7..7015b3c0fbc53 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -57,8 +57,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 { ; NARROW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; NARROW-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float> ; NARROW-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]] ; NARROW-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]] ; NARROW-NEXT: [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]] ; NARROW-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 ; NARROW-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 
1 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll index 04a97f451770a..a0c48e312d727 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll @@ -8,7 +8,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK-COST-LABEL: arm_offset_q15 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(ir<%0>, ir<%offset>) +; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(vp<{{.+}}>, ir<%offset>) ; CHECK-COST: Cost of 8 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>) ; CHECK-COST: Cost of 2 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index 09b274de30214..291d80bfe94ff 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -460,17 +460,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 ; RV64-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 ; RV64-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) +; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 +; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 +; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 +; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] ; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] ; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 ; RV64-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 -; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 ; RV64-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 -; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 ; RV64-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 -; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 @@ -523,17 +523,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 ; RV32-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 ; RV32-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) +; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 +; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 +; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 +; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]] 
; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] ; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] ; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 ; RV32-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 -; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 ; RV32-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 -; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 ; RV32-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 -; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 @@ -593,43 +593,43 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1 ; RV64-UF2-NEXT: [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1 ; RV64-UF2-NEXT: [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1 -; RV64-UF2-NEXT: [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0 -; RV64-UF2-NEXT: [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1 -; RV64-UF2-NEXT: [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2 -; RV64-UF2-NEXT: [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3 ; RV64-UF2-NEXT: [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1 ; RV64-UF2-NEXT: [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1 ; RV64-UF2-NEXT: [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1 ; RV64-UF2-NEXT: [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1 +; RV64-UF2-NEXT: [[TMP60:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0 +; RV64-UF2-NEXT: [[TMP61:%.*]] = insertelement <4 x i7> [[TMP60]], i7 [[TMP6]], i32 1 +; RV64-UF2-NEXT: [[TMP62:%.*]] = insertelement <4 x i7> [[TMP61]], i7 [[TMP26]], i32 2 +; RV64-UF2-NEXT: [[TMP31:%.*]] = insertelement <4 x i7> [[TMP62]], i7 [[TMP27]], i32 3 ; RV64-UF2-NEXT: [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0 ; RV64-UF2-NEXT: [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1 ; RV64-UF2-NEXT: [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2 ; RV64-UF2-NEXT: [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3 ; RV64-UF2-NEXT: [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1) ; RV64-UF2-NEXT: [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1) -; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]] -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]] -; RV64-UF2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]] -; RV64-UF2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]] -; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]] -; RV64-UF2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]] -; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]] ; RV64-UF2-NEXT: [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0 -; RV64-UF2-NEXT: store i7 [[TMP7]], ptr [[TMP9]], align 1 ; RV64-UF2-NEXT: [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1 -; RV64-UF2-NEXT: store i7 [[TMP8]], ptr [[TMP10]], align 1 ; 
RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2 -; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP44]], align 1 ; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3 -; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP45]], align 1 ; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0 -; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP46]], align 1 ; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1 -; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP47]], align 1 ; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2 -; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP48]], align 1 ; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3 +; RV64-UF2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]] +; RV64-UF2-NEXT: [[TMP64:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]] +; RV64-UF2-NEXT: [[TMP65:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]] +; RV64-UF2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]] +; RV64-UF2-NEXT: [[TMP67:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]] +; RV64-UF2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP69:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]] +; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]] +; RV64-UF2-NEXT: store i7 [[TMP7]], ptr [[TMP63]], align 1 +; RV64-UF2-NEXT: store i7 [[TMP8]], ptr [[TMP64]], align 1 +; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP65]], align 1 +; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP66]], align 1 +; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP67]], align 1 +; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP68]], align 1 +; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP69]], align 1 ; RV64-UF2-NEXT: store i7 [[TMP57]], ptr [[TMP49]], align 1 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 2c6fe4f5c808e..9e15a46828350 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -126,30 +126,30 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP35]], i32 0 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i32 1 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 2 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 3 ; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP23]], align 4 ; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[TMP26]], align 4 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x float> poison, float [[TMP43]], i32 0 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i32 1 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i32 2 -; CHECK-NEXT: [[TMP50:%.*]] = 
insertelement <4 x float> [[TMP49]], float [[TMP46]], i32 3 ; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[TMP27]], align 4 ; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[TMP28]], align 4 ; CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[TMP29]], align 4 ; CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[TMP30]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x float> poison, float [[TMP51]], i32 0 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x float> [[TMP55]], float [[TMP52]], i32 1 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 2 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 3 ; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[TMP31]], align 4 ; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[TMP33]], align 4 ; CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[TMP34]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 2 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 3 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x float> poison, float [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i32 1 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i32 2 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP46]], i32 3 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x float> poison, float [[TMP51]], i32 0 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x float> [[TMP55]], float [[TMP52]], i32 1 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 2 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 3 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x float> poison, float [[TMP59]], i32 0 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 1 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 2 @@ -174,30 +174,30 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x float> poison, float [[TMP83]], i32 0 -; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 1 -; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 2 -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x float> [[TMP89]], float [[TMP86]], i32 3 ; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP72]], align 4 ; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP94:%.*]] = load float, ptr [[TMP74]], align 4 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x float> poison, float [[TMP91]], i32 0 -; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 1 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 2 -; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x float> [[TMP97]], float 
[[TMP94]], i32 3 ; CHECK-NEXT: [[TMP99:%.*]] = load float, ptr [[TMP75]], align 4 ; CHECK-NEXT: [[TMP100:%.*]] = load float, ptr [[TMP76]], align 4 ; CHECK-NEXT: [[TMP101:%.*]] = load float, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP102:%.*]] = load float, ptr [[TMP78]], align 4 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x float> poison, float [[TMP99]], i32 0 -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x float> [[TMP103]], float [[TMP100]], i32 1 -; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x float> [[TMP104]], float [[TMP101]], i32 2 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x float> [[TMP105]], float [[TMP102]], i32 3 ; CHECK-NEXT: [[TMP107:%.*]] = load float, ptr [[TMP79]], align 4 ; CHECK-NEXT: [[TMP108:%.*]] = load float, ptr [[TMP80]], align 4 ; CHECK-NEXT: [[TMP109:%.*]] = load float, ptr [[TMP81]], align 4 ; CHECK-NEXT: [[TMP110:%.*]] = load float, ptr [[TMP82]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x float> poison, float [[TMP83]], i32 0 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 1 +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 2 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x float> [[TMP89]], float [[TMP86]], i32 3 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x float> poison, float [[TMP91]], i32 0 +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 1 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 2 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x float> [[TMP97]], float [[TMP94]], i32 3 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x float> poison, float [[TMP99]], i32 0 +; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x float> [[TMP103]], float [[TMP100]], i32 1 +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x float> [[TMP104]], float [[TMP101]], i32 2 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x float> [[TMP105]], float [[TMP102]], i32 3 ; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x float> poison, float [[TMP107]], i32 0 ; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x float> [[TMP111]], float [[TMP108]], i32 1 ; CHECK-NEXT: [[TMP113:%.*]] = insertelement <4 x float> [[TMP112]], float [[TMP109]], i32 2 @@ -473,10 +473,10 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 8 ; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP10]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP15]], i32 1 ; CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x ptr> [[TMP22]], ptr [[TMP15]], i32 1 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x ptr> [[TMP20]], ptr [[TMP19]], i32 1 ; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <2 x ptr> [[TMP17]], zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index 3361068c99220..3a0c15ebd05e7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -376,30 +376,30 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x ptr> [[TMP42]], ptr [[TMP15]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP43]], ptr [[TMP16]], i32 3 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3 ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3 ; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1 ; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2 ; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3 ; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4 ; CHECK-NEXT: store float [[TMP36]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index c4fc60908c7e0..57fda151f4739 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -687,16 +687,16 @@ define void @test_gather_not_profitable_pr48429(i32 
%d, ptr readonly noalias %pt ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX21]] ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]] +; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META8]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD16]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]] +; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META15]] ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8 ; AVX512-NEXT: [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512 ; AVX512-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]] -; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]] ; AVX512-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] @@ -716,7 +716,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16 ; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]] -; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -778,16 +778,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0 ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope [[META8:![0-9]+]] ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, 
!alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; FVW2-NEXT: store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP16]], i32 0 ; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope [[META15:![0-9]+]] +; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0 +; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1 ; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1 ; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1 -; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0 ; FVW2-NEXT: store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]] -; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1 ; FVW2-NEXT: store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FVW2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 0a85548f8750b..2b08eb27d89b1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -303,36 +303,36 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8> ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i32 0 -; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META14]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i32 1 -; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i32 2 -; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i32 3 -; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i32 4 -; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i32 5 -; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i32 6 -; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i32 7 -; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i32 8 -; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i32 9 -; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP36:%.*]] = 
extractelement <16 x i8> [[TMP25]], i32 10 -; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i32 11 -; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i32 12 -; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i32 13 -; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i32 14 -; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i32 15 +; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META17]], !noalias [[META14]] +; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll index 3a07bcca523ce..abb69ff67baf0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll @@ -32,9 +32,10 @@ define void @test_pr55375_interleave_opaque_ptr(ptr %start, ptr %end) { ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> [[TMP9]], ptr [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x 
ptr> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x ptr> zeroinitializer, <2 x ptr> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x ptr> [[TMP12]], <4 x ptr> poison, <4 x i32> -; CHECK-NEXT: store <4 x ptr> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8 +; CHECK-NEXT: store <4 x ptr> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll index cdc7839bfc0f0..a46cd6987ec1c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll @@ -32,42 +32,31 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 104 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 112 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 120 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]] ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP1]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP2]] ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP3]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]] +; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]] ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP5]] ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP6]] ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP7]] -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP9]] ; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP10]] -; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]] -; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]] -; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]] -; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]] +; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]] +; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]] ; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 4 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 4 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 4 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 4 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, 
ptr [[NEXT_GEP7]], i64 4 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 4 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i64 4 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP12]], i64 4 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP16]], i32 -4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP20]], i32 -4 -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP24]], i32 -4 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -4 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP29]], i32 -4 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP30]], i32 -4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> @@ -85,36 +74,36 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]] ; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 -; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 -; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 -; CHECK-NEXT: store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4 ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 -; CHECK-NEXT: store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4 ; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 -; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 -; CHECK-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2 -; CHECK-NEXT: store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4 ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 -; CHECK-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 -; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP9]], align 4 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 -; CHECK-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 -; CHECK-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4 ; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3 -; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP12]], align 4 ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 -; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP13]], align 4 ; 
CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 -; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP14]], align 4 ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 -; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP15]], align 4 ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 +; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4 +; CHECK-NEXT: store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP5]], align 4 +; CHECK-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4 +; CHECK-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4 +; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP9]], align 4 +; CHECK-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4 +; CHECK-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4 +; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP12]], align 4 +; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP13]], align 4 +; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP14]], align 4 +; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP15]], align 4 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[NEXT_GEP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 @@ -123,11 +112,11 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[M]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL26]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 ; CHECK-NEXT: [[P_4:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 4 ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[P_4]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll index 09946bfda5a7a..be1781da234f8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll @@ -28,18 +28,18 @@ define void @pr63602_1(ptr %arr) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 -; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP8]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <12 x i32>, ptr [[TMP17]], align 4 @@ -47,13 +47,13 @@ define void @pr63602_1(ptr %arr) { ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC2]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 -; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 -; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -143,18 +143,18 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP10]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 -; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 -; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]] +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP17]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP18]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[TMP1]], 2 ; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP3]], 2 @@ -163,10 +163,10 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP28]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 1 ; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 2 @@ -181,13 +181,13 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 3 ; CHECK-NEXT: [[TMP44:%.*]] = add <4 x i32> [[TMP35]], [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: store i32 [[TMP45]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2 -; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3 -; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP15]], align 4 +; CHECK-NEXT: store i32 [[TMP45]], ptr [[TMP16]], align 4 +; 
CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP17]], align 4 +; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP18]], align 4 +; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index f4cd48de60243..333454bb6d27b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -163,30 +163,30 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], 
i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -320,30 +320,30 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 
0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -352,30 +352,30 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP65:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP67:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3 ; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP73:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1 -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3 ; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 ; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 ; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 ; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 @@ -495,30 
+495,30 @@ define i32 @test_step_narrower_than_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i1> [[TMP64]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x i1> [[TMP69]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP74]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i1> [[TMP79]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x i1> [[TMP84]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP89]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -773,30 +773,30 @@ define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) { ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = load i1, ptr [[TMP20]], 
align 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 @@ -933,30 +933,30 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; 
CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -1138,30 +1138,30 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 
; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -1186,30 +1186,30 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 ; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: 
[[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 ; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 ; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 ; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 ; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 @@ -1330,30 +1330,30 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr 
[[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -1483,30 +1483,30 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> 
poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -1636,30 +1636,30 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 
[[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -1798,30 +1798,30 @@ define i32 @test_constant_max(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1 +; 
CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 @@ -1959,30 +1959,30 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; 
CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -2113,30 +2113,30 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; 
CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -2277,30 +2277,30 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; 
CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -2439,30 +2439,30 @@ define i32 @test_stride_three(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: 
[[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -2487,30 +2487,30 @@ define i32 @test_stride_three(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 ; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 ; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 ; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 ; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 ; CHECK-NEXT: [[TMP110:%.*]] = 
insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 @@ -2614,14 +2614,14 @@ define i32 @test_non_unit_stride_four(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP17:%.*]] = load i1, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP18:%.*]] = load i1, ptr [[TMP10]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = load i1, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i1> poison, i1 [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i1> [[TMP20]], i1 [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i1> [[TMP21]], i1 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP19]], i32 3 ; CHECK-NEXT: [[TMP24:%.*]] = load i1, ptr [[TMP12]], align 1 ; CHECK-NEXT: [[TMP25:%.*]] = load i1, ptr [[TMP13]], align 1 ; CHECK-NEXT: [[TMP26:%.*]] = load i1, ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP27:%.*]] = load i1, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP19]], i32 3 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i1> poison, i1 [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i1> [[TMP28]], i1 [[TMP25]], i32 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i1> [[TMP29]], i1 [[TMP26]], i32 2 @@ -2638,14 +2638,14 @@ define i32 @test_non_unit_stride_four(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP33]], align 4 ; CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP34]], align 4 ; CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP35]], align 4 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP36]], align 4 ; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP37]], align 4 ; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP38]], align 4 ; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP39]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> poison, i32 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[TMP50]], i32 2 @@ -2761,30 +2761,30 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement 
<4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -2809,30 +2809,30 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 ; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 ; CHECK-NEXT: 
[[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 ; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 ; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 ; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 ; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 @@ -2954,30 +2954,30 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 ; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 ; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = 
insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 ; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -3002,30 +3002,30 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 ; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 ; 
CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 ; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 ; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 ; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 @@ -3164,30 +3164,30 @@ define i32 @test_non_unit_stride_with_first_iteration_step_access(i64 %len, ptr ; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP33]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP34]], align 1 ; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP35]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 ; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP36]], align 1 ; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP37]], align 1 ; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP38]], align 1 ; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP39]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 ; CHECK-NEXT: [[TMP64:%.*]] = load i1, ptr [[TMP40]], align 1 ; CHECK-NEXT: [[TMP65:%.*]] = load i1, ptr [[TMP41]], align 1 ; CHECK-NEXT: [[TMP66:%.*]] = load i1, ptr [[TMP42]], 
align 1 ; CHECK-NEXT: [[TMP67:%.*]] = load i1, ptr [[TMP43]], align 1 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i1> poison, i1 [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i1> [[TMP68]], i1 [[TMP65]], i32 1 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i1> [[TMP69]], i1 [[TMP66]], i32 2 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i1> [[TMP70]], i1 [[TMP67]], i32 3 ; CHECK-NEXT: [[TMP72:%.*]] = load i1, ptr [[TMP44]], align 1 ; CHECK-NEXT: [[TMP73:%.*]] = load i1, ptr [[TMP45]], align 1 ; CHECK-NEXT: [[TMP74:%.*]] = load i1, ptr [[TMP46]], align 1 ; CHECK-NEXT: [[TMP75:%.*]] = load i1, ptr [[TMP47]], align 1 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i1> poison, i1 [[TMP64]], i32 0 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i1> [[TMP68]], i1 [[TMP65]], i32 1 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i1> [[TMP69]], i1 [[TMP66]], i32 2 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i1> [[TMP70]], i1 [[TMP67]], i32 3 ; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i1> poison, i1 [[TMP72]], i32 0 ; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i1> [[TMP76]], i1 [[TMP73]], i32 1 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i1> [[TMP77]], i1 [[TMP74]], i32 2 @@ -3212,30 +3212,30 @@ define i32 @test_non_unit_stride_with_first_iteration_step_access(i64 %len, ptr ; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP81]], align 4 ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP82]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP83]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP84]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP85]], align 4 ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP86]], align 4 ; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP87]], align 4 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 ; CHECK-NEXT: [[TMP112:%.*]] = load i32, ptr [[TMP88]], align 4 ; CHECK-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP89]], align 4 ; CHECK-NEXT: [[TMP114:%.*]] = load i32, ptr [[TMP90]], align 4 ; CHECK-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP91]], align 4 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <4 x i32> poison, i32 [[TMP112]], i32 0 -; CHECK-NEXT: [[TMP117:%.*]] = 
insertelement <4 x i32> [[TMP116]], i32 [[TMP113]], i32 1
-; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP117]], i32 [[TMP114]], i32 2
-; CHECK-NEXT: [[TMP119:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP115]], i32 3
; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP92]], align 4
; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP93]], align 4
; CHECK-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP94]], align 4
; CHECK-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP95]], align 4
+; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
+; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
+; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
+; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
+; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
+; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3
+; CHECK-NEXT: [[TMP116:%.*]] = insertelement <4 x i32> poison, i32 [[TMP112]], i32 0
+; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP116]], i32 [[TMP113]], i32 1
+; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP117]], i32 [[TMP114]], i32 2
+; CHECK-NEXT: [[TMP119:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP115]], i32 3
; CHECK-NEXT: [[TMP124:%.*]] = insertelement <4 x i32> poison, i32 [[TMP120]], i32 0
; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> [[TMP124]], i32 [[TMP121]], i32 1
; CHECK-NEXT: [[TMP126:%.*]] = insertelement <4 x i32> [[TMP125]], i32 [[TMP122]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index f1cbb4acc6aa6..879066cee931a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -73,6 +73,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
@@ -92,14 +96,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
-; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
-; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]]
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
-; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
-; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]]
+; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]]
+; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]]
+; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP27]], i64 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !llvm.access.group [[ACC_GRP0]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
index 644f10b617eb7..db9be20f7d190 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
@@ -10,6 +10,8 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index e5883351ae30d..51944b9329275 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -145,20 +145,20 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP16]]
; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP32]]
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP33]]
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 2
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
-; CHECK-NEXT: [[TMP40:%.*]] = 
getelementptr i16, ptr [[DST]], i64 [[TMP39]] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 0 -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 1 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 2 -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]] ; CHECK-NEXT: store i16 0, ptr [[TMP11]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP36]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index ad8f1fb3ccd21..a36ad9b854d06 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -106,14 +106,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0 -; CHECK-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]] @@ -122,14 +114,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 
[[TMP91]], i32 3 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]] @@ -138,14 +122,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0 -; CHECK-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1 -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2 -; CHECK-NEXT: [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6 -; CHECK-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7 ; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]] @@ -154,6 +130,30 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2 +; CHECK-NEXT: [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7 +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 
[[TMP94]], i32 6 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2 +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3 +; CHECK-NEXT: [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4 +; CHECK-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5 +; CHECK-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6 +; CHECK-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7 ; CHECK-NEXT: [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0 ; CHECK-NEXT: [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1 ; CHECK-NEXT: [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2 @@ -338,14 +338,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0 -; MAX-BW-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1 -; MAX-BW-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2 -; MAX-BW-NEXT: [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3 -; MAX-BW-NEXT: [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4 -; MAX-BW-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5 -; MAX-BW-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6 -; MAX-BW-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7 ; MAX-BW-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]] @@ -354,14 +346,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0 -; MAX-BW-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1 -; MAX-BW-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2 -; MAX-BW-NEXT: [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3 -; MAX-BW-NEXT: [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4 -; MAX-BW-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5 -; MAX-BW-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6 -; MAX-BW-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7 ; MAX-BW-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP105:%.*]] = load i32, ptr 
[[TMP57]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]] @@ -370,14 +354,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0 -; MAX-BW-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1 -; MAX-BW-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2 -; MAX-BW-NEXT: [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3 -; MAX-BW-NEXT: [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4 -; MAX-BW-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5 -; MAX-BW-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6 -; MAX-BW-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7 ; MAX-BW-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]] @@ -386,6 +362,30 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0 +; MAX-BW-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1 +; MAX-BW-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2 +; MAX-BW-NEXT: [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3 +; MAX-BW-NEXT: [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4 +; MAX-BW-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5 +; MAX-BW-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6 +; MAX-BW-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7 +; MAX-BW-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0 +; MAX-BW-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1 +; MAX-BW-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2 +; MAX-BW-NEXT: [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3 +; MAX-BW-NEXT: [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4 +; MAX-BW-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5 +; MAX-BW-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6 +; MAX-BW-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7 +; MAX-BW-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0 +; MAX-BW-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1 +; MAX-BW-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2 +; MAX-BW-NEXT: [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3 
+; MAX-BW-NEXT: [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4 +; MAX-BW-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5 +; MAX-BW-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6 +; MAX-BW-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7 ; MAX-BW-NEXT: [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0 ; MAX-BW-NEXT: [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1 ; MAX-BW-NEXT: [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2 @@ -530,6 +530,14 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]] @@ -538,21 +546,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0 ; CHECK-NEXT: store i8 [[TMP28]], ptr [[TMP20]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1 ; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2 ; CHECK-NEXT: store i8 [[TMP30]], ptr [[TMP22]], align 1 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3 ; CHECK-NEXT: store i8 [[TMP31]], ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4 ; CHECK-NEXT: store i8 [[TMP32]], ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5 ; CHECK-NEXT: store i8 [[TMP33]], ptr [[TMP25]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6 ; CHECK-NEXT: store i8 [[TMP34]], ptr [[TMP26]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7 ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -626,53 +626,53 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> ; MAX-BW-NEXT: [[TMP34:%.*]] = add <16 x 
i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; MAX-BW-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8> -; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]] -; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] -; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] -; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] -; MAX-BW-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]] -; MAX-BW-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]] -; MAX-BW-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]] -; MAX-BW-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]] -; MAX-BW-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]] -; MAX-BW-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]] -; MAX-BW-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]] -; MAX-BW-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]] -; MAX-BW-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]] -; MAX-BW-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]] -; MAX-BW-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]] -; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]] ; MAX-BW-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0 -; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP36]], align 1 ; MAX-BW-NEXT: [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1 -; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP37]], align 1 ; MAX-BW-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2 -; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP38]], align 1 ; MAX-BW-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3 -; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP39]], align 1 ; MAX-BW-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4 -; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP40]], align 1 ; MAX-BW-NEXT: [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5 -; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP41]], align 1 ; MAX-BW-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6 -; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP42]], align 1 ; MAX-BW-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7 -; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP43]], align 1 ; MAX-BW-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8 -; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP44]], align 1 ; MAX-BW-NEXT: [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9 -; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP45]], align 1 ; MAX-BW-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10 -; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP46]], align 1 ; MAX-BW-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11 -; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP47]], align 1 ; MAX-BW-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12 -; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP48]], align 1 ; MAX-BW-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13 -; MAX-BW-NEXT: store i8 [[TMP65]], ptr 
[[TMP49]], align 1 ; MAX-BW-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14 -; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP50]], align 1 ; MAX-BW-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15 +; MAX-BW-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]] +; MAX-BW-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] +; MAX-BW-NEXT: [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] +; MAX-BW-NEXT: [[TMP72:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] +; MAX-BW-NEXT: [[TMP73:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]] +; MAX-BW-NEXT: [[TMP74:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]] +; MAX-BW-NEXT: [[TMP75:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]] +; MAX-BW-NEXT: [[TMP76:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]] +; MAX-BW-NEXT: [[TMP77:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]] +; MAX-BW-NEXT: [[TMP78:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]] +; MAX-BW-NEXT: [[TMP79:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]] +; MAX-BW-NEXT: [[TMP80:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]] +; MAX-BW-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]] +; MAX-BW-NEXT: [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]] +; MAX-BW-NEXT: [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]] +; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]] +; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP69]], align 1 +; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP70]], align 1 +; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP71]], align 1 +; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP72]], align 1 +; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP73]], align 1 +; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP74]], align 1 +; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP75]], align 1 +; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP76]], align 1 +; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP77]], align 1 +; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP78]], align 1 +; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP79]], align 1 +; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP80]], align 1 +; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP81]], align 1 +; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP82]], align 1 +; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP83]], align 1 ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll index 85d6c801dee69..f11e194018f4f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll @@ -32,12 +32,12 @@ define void @test(ptr %A) { ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[STRIDED_VEC]], splat (i32 2) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0 -; 
CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 1
-; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP10]], align 4
; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index f29428c51c636..0a4f326608dd7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -794,20 +794,20 @@ define dso_local void @unconditional_strided1_optsize(ptr noalias nocapture read
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1)
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP15]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP2]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP4]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
index 414394a8942e5..d8cbcec318f22 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -27,41 +27,41 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]]
; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], splat (i64 2)
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP1]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP1]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP8]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0
; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP10]], ptr [[TMP3]], align 2
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1
; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], ptr [[TMP5]], align 2
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2
; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP12]], ptr [[TMP7]], align 2
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3
; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2
; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]]
;
DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP14]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = or disjoint <4 x i64> [[TMP1]], splat (i64 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP15]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP15]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP15]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP15]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP22]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP24]], ptr [[TMP17]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP25]], ptr [[TMP19]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], ptr [[TMP21]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll index c81f48ff62afc..23a26b28acec9 100644 --- a/llvm/test/Transforms/LoopVectorize/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/assume.ll @@ -8,12 +8,12 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+02) ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+02) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP3]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: tail 
call void @llvm.assume(i1 [[TMP3]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll index 8397b1ed88dcd..5e96abce8c5e2 100644 --- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll +++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll @@ -21,12 +21,12 @@ define i32 @foo(ptr nocapture %A) { ; CHECK-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8) ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: store i32 4, ptr [[TMP3]], align 4 ; CHECK-NEXT: store i32 4, ptr [[TMP5]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 7a54519c7cdf8..35780f207f162 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -15,8 +15,8 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 @@ -26,8 +26,7 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = 
insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: @@ -35,8 +34,7 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP27]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: @@ -191,8 +189,8 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 2) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 @@ -202,8 +200,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: @@ -211,8 +208,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP27]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: @@ -291,8 +287,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = 
extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 @@ -302,8 +298,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: @@ -311,8 +306,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP15]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: @@ -391,8 +385,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 @@ -402,8 +396,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: @@ -411,8 +404,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x ptr> [[TMP1]], 
i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP15]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: @@ -491,8 +483,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 @@ -502,8 +494,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: @@ -511,8 +502,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP15]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: @@ -686,6 +676,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 @@ -694,7 +686,6 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; 
CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] @@ -703,19 +694,14 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP23]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: ; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP28]], i64 4), "dereferenceable"(ptr [[TMP20]], i64 4) ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP29]], i64 4), "dereferenceable"(ptr [[TMP19]], i64 4) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP17]], i64 4), "dereferenceable"(ptr [[TMP17]], i64 4) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP22]], i64 4), "dereferenceable"(ptr [[TMP22]], i64 4) ] ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 0 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4 @@ -792,8 +778,8 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 @@ -803,8 +789,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = 
insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: @@ -812,8 +797,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP27]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 7684d274a75cf..910cfabda4b76 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1055,16 +1055,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2 -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -1152,16 +1146,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) { ; SINK-AFTER-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2 ; SINK-AFTER-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4 ; SINK-AFTER-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6 -; SINK-AFTER-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2 -; SINK-AFTER-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2 ; SINK-AFTER-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2 ; SINK-AFTER-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2 -; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] -; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] ; SINK-AFTER-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; SINK-AFTER-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4 -; SINK-AFTER-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4 ; 
SINK-AFTER-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 ; SINK-AFTER-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -1486,14 +1474,14 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) { ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP9]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = load double, ptr [[TMP10]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = load double, ptr [[TMP11]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = insertelement <4 x double> poison, double [[TMP16]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = insertelement <4 x double> [[TMP20]], double [[TMP17]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x double> [[TMP21]], double [[TMP18]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = insertelement <4 x double> [[TMP22]], double [[TMP19]], i32 3 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP12]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP13]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP14]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP15]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = insertelement <4 x double> poison, double [[TMP16]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = insertelement <4 x double> [[TMP45]], double [[TMP17]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = insertelement <4 x double> [[TMP46]], double [[TMP18]], i32 2 +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = insertelement <4 x double> [[TMP47]], double [[TMP19]], i32 3 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <4 x double> poison, double [[TMP24]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP28]], double [[TMP25]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP26]], i32 2 @@ -1943,14 +1931,14 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP10]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = load i16, ptr [[TMP11]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP12]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[TMP20]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP21]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> [[TMP25]], i16 [[TMP22]], i32 3 ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load i16, ptr [[TMP13]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i16, ptr [[TMP14]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i16, ptr [[TMP15]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i16, ptr [[TMP16]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = insertelement <4 x i16> [[TMP49]], i16 [[TMP20]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = insertelement <4 x i16> [[TMP50]], i16 [[TMP21]], i32 2 +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> [[TMP51]], i16 [[TMP22]], i32 3 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = insertelement <4 x i16> poison, i16 [[TMP27]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP31]], i16 [[TMP28]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP29]], i32 2 diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll 
b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll index 6374fba71f2c2..6a5e92d1c96f2 100644 --- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll @@ -47,14 +47,14 @@ define dso_local void @forked_ptrs_different_base_same_offset(ptr nocapture read ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT9]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP29]], i64 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP31]], i64 12 ; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP10]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 5a5b06de69552..d83a2c89ec8b8 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1002,10 +1002,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] ; UNROLL-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; UNROLL-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; UNROLL-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1 ; UNROLL-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP8]], align 4 ; UNROLL-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4 +; UNROLL-NEXT: [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP11]], i64 1 ; UNROLL-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0 ; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP15]], i64 1 ; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] @@ -1014,10 +1014,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] ; UNROLL-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP18]], align 4 ; UNROLL-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4 -; UNROLL-NEXT: [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 -; UNROLL-NEXT: [[TMP25:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP23]], i64 1 ; UNROLL-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP20]], align 4 ; UNROLL-NEXT: 
[[TMP27:%.*]] = load float, ptr [[TMP21]], align 4 +; UNROLL-NEXT: [[TMP45:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 +; UNROLL-NEXT: [[TMP25:%.*]] = insertelement <2 x float> [[TMP45]], float [[TMP23]], i64 1 ; UNROLL-NEXT: [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0 ; UNROLL-NEXT: [[TMP29:%.*]] = insertelement <2 x float> [[TMP28]], float [[TMP27]], i64 1 ; UNROLL-NEXT: [[TMP30:%.*]] = fadd fast <2 x float> [[VEC_PHI]], splat (float 1.000000e+00) @@ -1083,10 +1083,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP12]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP12]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP3]] @@ -1095,10 +1095,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP20]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP22]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <2 x float> [[TMP46]], float [[TMP24]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = fadd fast <2 x float> [[VEC_PHI]], splat (float 1.000000e+00) @@ -1255,8 +1255,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1300,8 +1300,8 @@ define void @scalarize_induction_variable_03(ptr 
%p, i32 %y, i64 %n) { ; IND-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 ; IND-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[TMP6]], [[BROADCAST_SPLAT]] ; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0 -; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1 +; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1345,21 +1345,21 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1 ; UNROLL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 8 ; UNROLL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8 -; UNROLL-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 -; UNROLL-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i64 1 ; UNROLL-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP5]], align 8 ; UNROLL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8 +; UNROLL-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 +; UNROLL-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP8]], i64 1 ; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 ; UNROLL-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 ; UNROLL-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0 -; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1 -; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i64 0 -; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP16]], i64 1 +; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 +; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1405,21 +1405,21 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP7]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1 ; 
UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: store i32 [[TMP21]], ptr [[TMP7]], align 8 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1478,20 +1478,20 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 ; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3 -; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 ; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 ; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3 +; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 ; INTERLEAVE-NEXT: store i32 [[TMP26]], ptr [[TMP16]], align 8 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1570,10 +1570,10 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, 
ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP13]], align 1, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP26]], align 1, !alias.scope [[META17:![0-9]+]] ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 @@ -1633,10 +1633,10 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; IND-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1 ; IND-NEXT: [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; IND-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0 -; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; IND-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1 +; IND-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope [[META17:![0-9]+]] +; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP24]], align 1, !alias.scope [[META17:![0-9]+]] ; IND-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope [[META17]] ; IND-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 ; IND-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1 @@ -1700,16 +1700,16 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2) ; UNROLL-NEXT: [[TMP13:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8) ; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0 -; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] ; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1 -; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0 -; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1 +; UNROLL-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] +; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; UNROLL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] -; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17:![0-9]+]] -; UNROLL-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope [[META17]] -; UNROLL-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP35]], align 1, !alias.scope [[META17:![0-9]+]] +; UNROLL-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP36]], align 1, !alias.scope [[META17]] ; UNROLL-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope [[META17]] ; UNROLL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 ; 
UNROLL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1 @@ -1779,16 +1779,16 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], splat (i64 2) ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]] +; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP16]], align 1, !alias.scope [[META17:![0-9]+]] -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope [[META17]] -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP37]], align 1, !alias.scope [[META17:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP38]], align 1, !alias.scope [[META17]] ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope [[META17]] ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 @@ -1864,30 +1864,30 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[DOTIDX5]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP18]], align 1 ; INTERLEAVE-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP19]], align 1 -; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 -; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP13]], i32 1 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP14]], i32 1 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP15]], i32 1 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP16]], i32 1 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP17]], i32 1 ; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP28]], ptr [[TMP20]], align 1, !alias.scope 
[[META17:![0-9]+]], !noalias [[META20:![0-9]+]] ; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 -; INTERLEAVE-NEXT: store i32 [[TMP29]], ptr [[TMP21]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 -; INTERLEAVE-NEXT: store i32 [[TMP30]], ptr [[TMP22]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 -; INTERLEAVE-NEXT: store i32 [[TMP31]], ptr [[TMP23]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP32]], ptr [[TMP24]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4 -; INTERLEAVE-NEXT: store i32 [[TMP33]], ptr [[TMP25]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 -; INTERLEAVE-NEXT: store i32 [[TMP34]], ptr [[TMP26]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 -; INTERLEAVE-NEXT: store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 +; INTERLEAVE-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 +; INTERLEAVE-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1 +; INTERLEAVE-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP13]], i32 1 +; INTERLEAVE-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP14]], i32 1 +; INTERLEAVE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP15]], i32 1 +; INTERLEAVE-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP16]], i32 1 +; INTERLEAVE-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP17]], i32 1 +; INTERLEAVE-NEXT: store i32 [[TMP28]], ptr [[TMP41]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META20:![0-9]+]] +; INTERLEAVE-NEXT: store i32 [[TMP29]], ptr [[TMP42]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: store i32 [[TMP30]], ptr [[TMP43]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: store i32 [[TMP31]], ptr [[TMP44]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: store i32 [[TMP32]], ptr [[TMP45]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: store i32 [[TMP33]], ptr [[TMP46]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: store i32 [[TMP34]], ptr [[TMP47]], align 1, !alias.scope [[META17]], !noalias [[META20]] +; INTERLEAVE-NEXT: store i32 [[TMP35]], ptr [[TMP48]], align 1, !alias.scope [[META17]], !noalias [[META20]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -2442,11 +2442,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add 
<2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 ; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2490,11 +2490,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; IND-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 ; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; IND-NEXT: [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16> +; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 +; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 -; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 ; IND-NEXT: store i16 [[TMP8]], ptr [[TMP6]], align 2 -; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2543,17 +2543,17 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> ; UNROLL-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> +; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 +; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 +; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 ; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 -; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 ; UNROLL-NEXT: store i16 [[TMP14]], ptr [[TMP10]], align 2 -; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 ; UNROLL-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 ; UNROLL-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 -; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4) @@ -2604,17 +2604,17 @@ define void 
@iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16> +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 ; UNROLL-NO-IC-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 ; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2) @@ -2667,6 +2667,14 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16> ; INTERLEAVE-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16> +; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 +; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 +; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 @@ -2675,21 +2683,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP7]], i32 1 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP8]], i32 1 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP9]], i32 1 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP22]], ptr [[TMP14]], 
align 2 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP23]], ptr [[TMP15]], align 2 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP24]], ptr [[TMP16]], align 2 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP25]], ptr [[TMP17]], align 2 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP26]], ptr [[TMP18]], align 2 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP27]], ptr [[TMP19]], align 2 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP28]], ptr [[TMP20]], align 2 -; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP29]], ptr [[TMP21]], align 2 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index fb4545cb715bb..0ff24c34ff3ca 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -144,8 +144,8 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) { ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0 -; CHECK-NEXT: store i64 [[TMP11]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2 +; CHECK-NEXT: store i64 [[TMP11]], ptr [[TMP2]], align 8 ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 80ccc383b67f8..b37f8caa5b457 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -578,6 +578,10 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture % ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]], i32 0 @@ -595,21 +599,17 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture % ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> 
[[TMP18]], i64 [[TMP15]], i64 3 ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3 ; CHECK-NEXT: store i64 [[TMP21]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1 ; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2 ; CHECK-NEXT: store i64 [[TMP23]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3 ; CHECK-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0 ; CHECK-NEXT: store i64 [[TMP25]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1 ; CHECK-NEXT: store i64 [[TMP26]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2 ; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3 ; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) @@ -919,12 +919,12 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) { ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1000,12 +1000,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 ; CHECK-NEXT: 
[[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> @@ -1097,12 +1097,12 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1173,24 +1173,24 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load 
<8 x i32>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> @@ -1364,7 +1364,15 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP12]], i64 12 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]] @@ -1373,21 +1381,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP34]], i64 28 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP35]], i64 36 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP15]], align 4 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP17]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index 0306567c5ce98..a02fddc4cf72d 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -134,18 +134,14 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: 
[[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] -; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index f801443b85d3f..5f9f87ffeecdc 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -1101,7 +1101,6 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]] ; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; VEC-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2 -; VEC-NEXT: [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]] ; VEC-NEXT: [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; VEC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 @@ -1294,8 +1293,8 @@ define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4 ; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1) ; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 -; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32 ; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; VEC-NEXT: [[TMP9:%.*]] = zext nneg i16 [[TMP3]] to i32 ; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32 ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) @@ -1389,9 +1388,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 0 ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1 ; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 -; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1 ; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 -; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1 ; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VEC: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll 
index 7a33fd092e293..f8d0c2737d7bc 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -570,8 +570,8 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2 @@ -664,11 +664,13 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP12]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll index ffeb3b19d11c4..f8ddd344f5587 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -192,8 +192,8 @@ define void @no_gep_or_bitcast(ptr noalias %a, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 8 ; CHECK-NEXT: store i32 0, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 1480bc930a5d2..fda6e4811c0f3 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -821,17 +821,17 @@ define void @multiple_ivs_wide(ptr %dst) { ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 6 ; 
CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) @@ -857,17 +857,17 @@ define void @multiple_ivs_wide(ptr %dst) { ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 6 ; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[VEC_IND2]], splat (i32 2) +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP17]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 ; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 ; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP21]], align 4 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 ; CHECK-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 ; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4 ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 8) diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index ebddca2294d9c..e84261aaf74dc 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -264,8 +264,8 @@ define void @pr43371() optsize { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -302,8 +302,8 @@ define void @pr43371() optsize { ; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -340,8 +340,8 @@ define void @pr43371() optsize { ; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -401,8 +401,8 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -439,8 +439,8 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll index 4c04d96d46479..566003114e44a 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll @@ -17,11 +17,14 @@ define void 
@wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP1]] @@ -31,11 +34,8 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 ; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP11]], align 8 ; CHECK-NEXT: store ptr [[TMP5]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1 ; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 ; CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3 ; CHECK-NEXT: store ptr [[TMP14]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 32 diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll index fde93d760696c..98e26dbf1eb94 100644 --- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -42,12 +42,12 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr 
[[TMP6]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP8]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll index 7579fbcb70ef1..db8d96cee011f 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -230,12 +230,12 @@ for.end: ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index a5cf45c082adb..1ef2667843127 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -30,16 +30,16 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64> ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP11]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP14]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP15]], 
align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP16]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index d5a206ff21da0..136b1419d91b8 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -192,8 +192,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 50b9ba12af82d..0c5cf09430bb9 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -79,20 +79,20 @@ define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly ; CHECK: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}}) ; // Lane 0 ; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 -; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i64 0 ; CHECK: [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0 ; CHECK: [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1 ; CHECK: [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1 -; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0 +; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i64 0 ; CHECK: [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1 ; // Lane 1 ; CHECK: [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0 ; CHECK: [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0 -; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1 +; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i64 1 ; CHECK: [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0 ; CHECK: [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1 ; CHECK: [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1 -; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1 +; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i64 1 ; CHECK: [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x 
float> [[VEC_B]], 1 ; // Store wide values: ; CHECK: [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll index 63ca45495335f..abdd5e9562590 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll @@ -76,10 +76,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[TMP0]]) #[[ATTR0]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]] +; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP3]], i64 [[TMP0]]) #[[ATTR0]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP8]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index fe533672f2ca2..fadbaaa228608 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -140,12 +140,12 @@ define void @blend_chain_iv(i1 %c) { ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i64> undef, <4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]] ; CHECK-NEXT: store i16 0, ptr [[TMP2]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP4]], align 2 @@ -229,6 +229,10 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr 
[[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x ptr> [[TMP35]], ptr [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x ptr> [[TMP36]], ptr [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x ptr> [[TMP37]], ptr [[TMP8]], i32 3 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll index 3aad626554cee..a694a22de110a 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll @@ -102,10 +102,10 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -154,20 +154,20 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 
-; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -211,12 +211,12 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -257,20 +257,20 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] 
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -312,20 +312,20 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -367,20 +367,20 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> 
[[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -424,12 +424,12 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -511,10 +511,10 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -561,10 +561,10 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -614,20 +614,20 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], 
i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -672,12 +672,12 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: store i64 [[TMP10]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 +; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] @@ -719,20 +719,20 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr 
[[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -775,20 +775,20 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -831,20 +831,20 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -889,12 +889,12 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: store i64 [[TMP10]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 +; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] @@ -937,10 +937,10 @@ define void @test_step_is_not_invariant(ptr %A) { ; CHECK-NEXT: [[TMP5:%.*]] = udiv <2 x i16> [[TMP4]], splat (i16 6) ; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP8]], align 2 +; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store i16 [[TMP2]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll index 1f331a4bf973d..a24842e87f5b6 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll @@ -102,10 +102,10 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -154,20 +154,20 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; 
CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -254,20 +254,20 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -309,20 +309,20 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = 
extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -362,10 +362,10 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -415,20 +415,20 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 
[[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -471,20 +471,20 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: 
[[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -527,20 +527,20 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll index ea8831c8ab7e5..9a72cd6faed2d 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll @@ -18,28 +18,28 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = urem <8 x i64> [[TMP0]], splat (i64 3) ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> poison, i64 [[TMP18]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 1 @@ -94,28 +94,28 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = udiv <8 x i64> [[TMP0]], splat (i64 2) ; CHECK-NEXT: [[TMP2:%.*]] = urem <8 x i64> [[TMP1]], splat (i64 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP14]], align 8 -; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> poison, i64 [[TMP19]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 1 @@ -168,28 +168,28 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] 
= extractelement <8 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> poison, i64 [[TMP17]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll index 1f33f7a15edd4..843fd09c3fd96 100644 --- 
a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll @@ -107,16 +107,16 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF4-NEXT: [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8 -; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0 ; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1 @@ -235,20 +235,20 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 
[[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -273,34 +273,34 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: 
[[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -344,12 +344,12 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 ; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; VF2-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -375,18 +375,18 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8 ; VF4-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = extractelement 
<4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP8]], ptr [[TMP12]], align 8 +; VF4-NEXT: store i64 [[TMP9]], ptr [[TMP13]], align 8 +; VF4-NEXT: store i64 [[TMP10]], ptr [[TMP14]], align 8 +; VF4-NEXT: store i64 [[TMP11]], ptr [[TMP15]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -427,20 +427,20 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -465,34 +465,34 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: 
[[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], 
ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -534,20 +534,20 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -572,34 +572,34 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], 
align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -640,10 +640,10 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; VF2-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 
[[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -671,16 +671,16 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; VF4-NEXT: [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8 -; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0 ; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1 @@ -734,12 +734,12 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 ; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 -; VF2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; VF2-NEXT: store i64 [[TMP10]], ptr [[TMP8]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 
[[TMP7]], ptr [[TMP9]], align 8 +; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP10]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; VF2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -766,18 +766,18 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 ; VF4-NEXT: [[TMP8:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0 -; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1 -; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2 -; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3 -; VF4-NEXT: store i64 [[TMP16]], ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2 +; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3 +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP9]], ptr [[TMP13]], align 8 +; VF4-NEXT: store i64 [[TMP10]], ptr [[TMP14]], align 8 +; VF4-NEXT: store i64 [[TMP11]], ptr [[TMP15]], align 8 +; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP16]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 ; VF4-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -819,20 +819,20 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; VF2-NEXT: [[TMP11:%.*]] = 
insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; VF2-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -858,34 +858,34 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3 ; VF4-NEXT: [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42) -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, 
ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -928,20 +928,20 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; VF2-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP13]], 
ptr [[TMP15]], align 8 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -967,34 +967,34 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3 ; VF4-NEXT: [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42) -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> 
[[TMP22]], i32 3 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll index ef6255720d73c..de77020d9034c 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll @@ -18,10 +18,10 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -52,16 +52,16 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; 
VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -144,16 +144,16 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -209,10 +209,10 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = 
insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -243,16 +243,16 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -311,20 +311,20 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 
[[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -353,34 +353,34 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 
8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -430,20 +430,20 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -472,34 +472,34 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: 
[[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], 
ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -549,20 +549,20 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -591,34 +591,34 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; 
VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -668,20 +668,20 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, 
ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -710,34 +710,34 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> 
poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -787,20 +787,20 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] 
-; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -829,34 +829,34 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = 
extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -906,20 +906,20 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = 
add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -948,34 +948,34 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = 
getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1023,10 +1023,10 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1058,16 +1058,16 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = 
insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1124,10 +1124,10 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1159,16 +1159,16 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1225,10 +1225,10 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1260,16 +1260,16 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1329,20 +1329,20 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = 
insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1372,34 +1372,34 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: 
[[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1450,20 +1450,20 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr 
inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1493,34 +1493,34 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; 
VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1571,20 +1571,20 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1614,34 +1614,34 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; 
VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; 
VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1692,20 +1692,20 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1735,34 +1735,34 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 
x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1813,20 +1813,20 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] 
= getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1856,34 +1856,34 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], 
i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1934,20 +1934,20 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = 
extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1977,34 +1977,34 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> 
[[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 8d854dd1164c3..5c3ef63c79833 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -340,12 +340,12 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP17]] ; CHECK-NEXT: store i32 0, ptr [[TMP12]], align 8 ; CHECK-NEXT: store i32 0, ptr [[TMP14]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll index 4a0a5e3be5b19..fcdb04b494dce 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll @@ -63,29 +63,29 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP20:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> 
[[TMP20]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]] -; CHECK-NEXT: store double [[TMP19]], ptr [[TMP22]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: store double [[TMP19]], ptr [[TMP24]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double [[TMP19]], ptr [[TMP31]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double [[TMP19]], ptr [[TMP26]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double [[TMP19]], ptr [[TMP33]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double [[TMP19]], ptr [[TMP28]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[TMP29:%.*]] = or disjoint <4 x i64> [[TMP20]], splat (i64 1) ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP29]], i32 0 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP29]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP29]], i32 2 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP29]], i32 3 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]] ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP36]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP31]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP41]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP35]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP42]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 732214aa1449e..ca14d1977c411 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -462,8 +462,9 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * vp<[[EXP_SCEV]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, vp<[[EXP_SCEV]]> ; CHECK-NEXT: WIDEN ir<%v3> = add nuw ir<%iv>, ir<1> +; CHECK-NEXT: EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%v3> ; 
CHECK-NEXT: REPLICATE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE store ir<%v3>, ir<%gep> +; CHECK-NEXT: REPLICATE store vp<[[SCALARS]]>, ir<%gep> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -905,6 +906,7 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l> +; CHECK-NEXT: EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%zext> ; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll index f948906d4d0b8..e88710b0f4889 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll @@ -78,9 +78,11 @@ define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]> -; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%in_val>) -; CHECK-NEXT: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> -; CHECK-NEXT: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-NEXT: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val> +; CHECK-NEXT: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>) +; CHECK-NEXT: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call> +; CHECK-NEXT: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0> +; CHECK-NEXT: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2> ; CHECK-NEXT: WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a>