diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ca092dcfcb492..d4bd4b56e89d3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -578,8 +578,10 @@ class InnerLoopVectorizer { /// The profitablity analysis. LoopVectorizationCostModel *Cost; - /// BFI and PSI are used to check for profile guided size optimizations. + /// Used to calculate the probability of predicated blocks in + /// getPredBlockCostDivisor. BlockFrequencyInfo *BFI; + /// Used to check for profile guided size optimizations. ProfileSummaryInfo *PSI; /// Structure to hold information about generated runtime checks, responsible @@ -900,7 +902,7 @@ class LoopVectorizationCostModel { InterleavedAccessInfo &IAI, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), - TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), BFI(BFI), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) { if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors) initializeVScaleForTuning(); @@ -1249,6 +1251,17 @@ class LoopVectorizationCostModel { /// Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I) const; + /// A helper function that returns how much we should divide the cost of a + /// predicated block by. Typically this is the reciprocal of the block + /// probability, i.e. if we return X we are assuming the predicated block will + /// execute once for every X iterations of the loop header so the block should + /// only contribute 1/X of its cost to the total cost calculation, but when + /// optimizing for code size it will just be 1 as code size costs don't depend + /// on execution probabilities. + inline unsigned + getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, + const BasicBlock *BB) const; + /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable @@ -1711,6 +1724,8 @@ class LoopVectorizationCostModel { /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; + const BlockFrequencyInfo *BFI; + const Function *TheFunction; /// Loop Vectorize Hint. @@ -2866,6 +2881,19 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { } } +unsigned LoopVectorizationCostModel::getPredBlockCostDivisor( + TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const { + if (CostKind == TTI::TCK_CodeSize) + return 1; + + uint64_t HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency(); + uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency(); + assert(HeaderFreq >= BBFreq && + "Header has smaller block freq than dominated BB?"); + return BFI->getBlockFreq(TheLoop->getHeader()).getFrequency() / + BFI->getBlockFreq(BB).getFrequency(); +} + std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, ElementCount VF) const { @@ -2902,7 +2930,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. 
- ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind); + ScalarizationCost = + ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent()); } InstructionCost SafeDivisorCost = 0; @@ -5035,7 +5064,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( } // Scale the total scalar cost by block probability. - ScalarCost /= getPredBlockCostDivisor(CostKind); + ScalarCost /= getPredBlockCostDivisor(CostKind, PredInst->getParent()); // Compute the discount. A non-negative discount means the vector version // of the instruction costs more, and scalarizing would be beneficial. @@ -5088,7 +5117,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { // cost by the probability of executing it. blockNeedsPredication from // Legal is used so as to not include all blocks in tail folded loops. if (VF.isScalar() && Legal->blockNeedsPredication(BB)) - BlockCost /= getPredBlockCostDivisor(CostKind); + BlockCost /= getPredBlockCostDivisor(CostKind, BB); Cost += BlockCost; } @@ -5167,7 +5196,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. if (isPredicatedInst(I)) { - Cost /= getPredBlockCostDivisor(CostKind); + Cost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Add the cost of an i1 extract and a branch auto *VecI1Ty = @@ -6727,6 +6756,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { SkipCostComputation.contains(UI); } +unsigned VPCostContext::getPredBlockCostDivisor( + TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const { + return CM.getPredBlockCostDivisor(CostKind, BB); +} + InstructionCost LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx) const { @@ -10310,9 +10344,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &MAMProxy = AM.getResult(F); PSI = MAMProxy.getCachedResult(*F.getParent()); - BFI = nullptr; - if (PSI && PSI->hasProfileSummary()) - BFI = &AM.getResult(F); + BFI = &AM.getResult(F); LoopVectorizeResult Result = runImpl(F); if (!Result.MadeAnyChange) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a1c6f7977885f..e3b0c2bff9d02 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { // For the scalar case, we may not always execute the original predicated // block, Thus, scale the block's cost by the probability of executing it. if (VF.isScalar()) - return ThenCost / getPredBlockCostDivisor(Ctx.CostKind); + if (auto *VPIRBB = dyn_cast(Then)) + return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind, + VPIRBB->getIRBasicBlock()); return ThenCost; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774b7c838..0e9d6e47c740d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -/// A helper function that returns how much we should divide the cost of a -/// predicated block by. 
Typically this is the reciprocal of the block -/// probability, i.e. if we return X we are assuming the predicated block will -/// execute once for every X iterations of the loop header so the block should -/// only contribute 1/X of its cost to the total cost calculation, but when -/// optimizing for code size it will just be 1 as code size costs don't depend -/// on execution probabilities. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -inline unsigned -getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) { - return CostKind == TTI::TCK_CodeSize ? 1 : 2; -} - /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: /// [1, 16) = {1, 2, 4, 8} @@ -378,6 +363,9 @@ struct VPCostContext { InstructionCost getScalarizationOverhead(Type *ResultTy, ArrayRef Operands, ElementCount VF); + + unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, + const BasicBlock *BB) const; }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index aa3de3613b68e..2e77b75b16e47 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3170,7 +3170,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. - ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind); + ScalarCost /= Ctx.getPredBlockCostDivisor(Ctx.CostKind, UI->getParent()); return ScalarCost; } case Instruction::Load: diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 94e860b8ce304..781f00f061799 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -261,6 +261,8 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo +; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo ; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index 3aea0f2061f3e..b01db2f09844f 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -128,6 +128,8 @@ ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo +; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo +; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index a08a140a35166..e0f39e8d73012 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ 
b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -180,6 +180,8 @@ ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo +; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo ; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll index ddfdb257ed49a..43adfdcb3e603 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: @@ -65,7 +65,7 @@ for.body: %r = phi i64 [ 0, %entry ], [ %var6, %for.inc ] %var0 = getelementptr inbounds i64, ptr %a, i64 %i %var2 = load i64, ptr %var0, align 4 - %cond0 = icmp sgt i64 %var2, 0 + %cond0 = icmp sgt i64 %var2, 1 br i1 %cond0, label %if.then, label %for.inc if.then: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index e4ee6776ae24c..8602a66d08805 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( ; COMMON-SAME: ptr [[DST:%.*]]) { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: br label %[[VECTOR_PH:.*]] -; COMMON: [[VECTOR_PH]]: -; COMMON-NEXT: br label %[[VECTOR_BODY:.*]] -; COMMON: [[VECTOR_BODY]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; COMMON: [[PRED_STORE_IF]]: -; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0 -; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]] -; COMMON: [[PRED_STORE_CONTINUE]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; COMMON: [[PRED_STORE_IF1]]: -; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1 -; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; COMMON: [[PRED_STORE_CONTINUE2]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; COMMON: [[PRED_STORE_IF3]]: -; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2 -; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1 -; COMMON-NEXT: br label 
%[[PRED_STORE_CONTINUE4]] -; COMMON: [[PRED_STORE_CONTINUE4]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] -; COMMON: [[PRED_STORE_IF5]]: -; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3 -; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; COMMON: [[PRED_STORE_CONTINUE6]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; COMMON: [[PRED_STORE_IF7]]: -; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4 -; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; COMMON: [[PRED_STORE_CONTINUE8]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; COMMON: [[PRED_STORE_IF9]]: -; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5 -; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; COMMON: [[PRED_STORE_CONTINUE10]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; COMMON: [[PRED_STORE_IF11]]: -; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6 -; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] -; COMMON: [[PRED_STORE_IF13]]: -; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 -; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT]] -; COMMON: [[EXIT]]: -; COMMON-NEXT: br label %[[SCALAR_PH:.*]] -; COMMON: [[SCALAR_PH]]: -; COMMON-NEXT: br [[EXIT1:label %.*]] -; COMMON: [[SCALAR_PH1:.*:]] +; COMMON-NEXT: [[ENTRY:.*]]: +; COMMON-NEXT: br label %[[EXIT1:.*]] +; COMMON: [[EXIT1]]: +; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ] +; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 +; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 +; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 +; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]] +; COMMON: [[SCALAR_PH1]]: +; COMMON-NEXT: ret void ; entry: br label %loop @@ -1241,8 +1196,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8) -; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; DEFAULT: [[VECTOR_MEMCHECK]]: ; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll index 7ae50a5e4a075..791ef734ec48b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll @@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() { ; CHECK-NEXT: Cost of 48 for VF 16: EMIT 
vp<{{.*}}> = first-active-lane ir<%cmp3> ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add ; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV -; CHECK: LV: Minimum required TC for runtime checks to be profitable:160 -; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160) +; CHECK: LV: Minimum required TC for runtime checks to be profitable:128 +; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128) ; CHECK-NEXT: LV: Too many memory checks needed. entry: %p1 = alloca [1024 x i8] @@ -105,7 +105,7 @@ loop.header: %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv %l = load i64, ptr %gep.src, align 1 %t = trunc i64 %l to i1 - br i1 %t, label %exit.0, label %loop.latch + br i1 %t, label %exit.0, label %loop.latch, !prof !0 loop.latch: %iv.next = add i64 %iv, 1 @@ -120,4 +120,6 @@ exit.1: ret i64 0 } +!0 = !{!"branch_weights", i32 1, i32 1} + attributes #1 = { "target-features"="+sve" vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index cc7b4aecc3642..71c2a05af964f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -274,69 +274,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @iv_trunc( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] -; 
PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]] -; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[FOR_BODY:.*]] ; PRED: [[FOR_BODY]]: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 ; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]] ; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64 @@ -344,7 +286,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 1, ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[FOR_BODY]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -440,101 +382,21 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @trunc_ivs_and_store( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] -; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]] -; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0 -; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]] -; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: 
[[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 -; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0 -; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false -; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0 -; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]] -; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]] -; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64> -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] -; PRED: [[PRED_STORE_IF2]]: -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 -; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]] -; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE3]] -; PRED: [[PRED_STORE_CONTINUE3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]] -; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 
[[TMP30]], ptr [[TMP29]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; PRED: [[PRED_STORE_CONTINUE5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_IF6]]: -; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 -; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]] -; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_CONTINUE7]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 -; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]] +; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; PRED-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64 ; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]] ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -630,91 +492,12 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @ivs_trunc_and_ext( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 
[[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; PRED: [[PRED_STORE_IF3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; PRED: [[PRED_STORE_CONTINUE4]]: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_IF5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 
[[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 ; PRED-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 @@ -723,7 +506,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -812,7 +595,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 1 ; PRED-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 ; PRED-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; PRED-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; PRED-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 @@ -845,7 +628,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED: [[PRED_STORE_CONTINUE5]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -858,7 +641,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll index 1bacae764f760..9bc371be8496d 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -30,7 +30,7 @@ for.body: ; preds = %for.body.preheader, %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.07 %0 = load i32, ptr %arrayidx, align 4 - %tobool.not = icmp eq i32 %0, 0 + %tobool.not = icmp eq i32 %0, 1 br i1 %tobool.not, label %for.inc, label %if.then if.then: ; preds = %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 4e989c5d3eca8..5073669e9876f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -425,7 +425,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[END]] to i10 ; CHECK-NEXT: [[TMP1:%.*]] = zext i10 [[TMP0]] to i64 ; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 12 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll new file mode 100644 index 0000000000000..aa5e74dffdbb9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll @@ -0,0 +1,154 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -mtriple=aarch64 -mattr=+sve -S %s | FileCheck %s + +define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { +; CHECK-LABEL: define void @nested( +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: br i1 [[C0]], label %[[THEN_0:.*]], label %[[LATCH]] +; CHECK: [[THEN_0]]: +; CHECK-NEXT: br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]] +; CHECK: [[THEN_1]]: +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i64, ptr [[P0]], i32 [[X]] +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[GEP0]], align 8 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[P1]], i32 [[X]] +; CHECK-NEXT: [[Y:%.*]] = load i64, ptr [[GEP1]], align 8 +; CHECK-NEXT: [[Z:%.*]] = udiv i64 [[X1]], [[Y]] +; CHECK-NEXT: store i64 [[Z]], ptr [[GEP1]], align 8 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[X]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + br i1 %c0, label %then.0, label %latch + +then.0: + br i1 %c1, label %then.1, label %latch + +then.1: + %gep0 = getelementptr i64, ptr %p0, i32 %iv + %x = load i64, ptr %gep0 + %gep1 = getelementptr i64, ptr %p1, i32 %iv + %y = load i64, ptr %gep1 + %z = udiv i64 %x, %y + store i64 %z, ptr %gep1 + br label %latch + 
+latch: + %iv.next = add i32 %iv, 1 + %done = icmp eq i32 %iv.next, 1024 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { +; CHECK-LABEL: define void @always_taken( +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i1 [[C0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P0]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP8]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP10]], i64 [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, [[TMP6]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP20]], i32 8, [[TMP6]], poison) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P1]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP13]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[TMP6]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[TMP6]], poison) +; CHECK-NEXT: [[TMP21:%.*]] = select [[TMP6]], [[WIDE_MASKED_LOAD4]], splat (i64 1) +; CHECK-NEXT: [[TMP14:%.*]] = select [[TMP6]], [[WIDE_MASKED_LOAD5]], splat (i64 1) +; CHECK-NEXT: [[TMP15:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = udiv [[WIDE_MASKED_LOAD3]], [[TMP14]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP15]], ptr [[TMP9]], i32 8, [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP22]], ptr [[TMP12]], i32 8, [[TMP6]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: br i1 [[C0]], label %[[THEN_0:.*]], label %[[LATCH]], !prof [[PROF3:![0-9]+]] +; CHECK: [[THEN_0]]: +; CHECK-NEXT: br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]], !prof [[PROF3]] +; CHECK: [[THEN_1]]: +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i64, ptr [[P0]], i32 [[IV1]] +; CHECK-NEXT: [[X:%.*]] = load i64, ptr [[GEP0]], align 8 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[P1]], i32 [[IV1]] +; CHECK-NEXT: [[Y:%.*]] = load i64, ptr [[GEP1]], align 8 +; CHECK-NEXT: [[Z:%.*]] = udiv i64 [[X]], [[Y]] +; CHECK-NEXT: store i64 [[Z]], ptr [[GEP1]], align 8 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT1]] = add i32 [[IV1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + br i1 %c0, label %then.0, label %latch, !prof !4 + +then.0: + br i1 %c1, label %then.1, label %latch, !prof !4 + +then.1: + %gep0 = getelementptr i64, ptr %p0, i32 %iv + %x = load i64, ptr %gep0 + %gep1 = getelementptr i64, ptr %p1, i32 %iv + %y = load i64, ptr %gep1 + %z = udiv i64 %x, %y + store i64 %z, ptr %gep1 + br label %latch + +latch: + %iv.next = add i32 %iv, 1 + %done = icmp eq i32 %iv.next, 1024 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +!4 = !{!"branch_weights", i32 1, i32 0} + diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll index e1b7b935a47f6..ee3f9efd46e2b 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll @@ -21,7 +21,7 @@ for.body386: ; preds = %entry, %l77 %arrayidx388 = getelementptr inbounds [101 x i32], ptr %src, i32 0, i32 %add387 %l41 = load i32, ptr %arrayidx388, align 4 %l42 = and i32 %l41, 65535 - %l43 = icmp eq i32 %l42, 0 + %l43 = icmp eq i32 %l42, 1 br i1 %l43, label %l77, label %l44 l44: ; preds = %for.body386 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index 3426fb16841c5..e988323505781 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -214,182 +214,10 @@ for.cond.cleanup: define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { ; DEFAULT-LABEL: define void @tail_predicate_without_optsize( ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { -; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]] -; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = 
insertelement <16 x i8> poison, i8 [[C]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] -; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ] -; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ] -; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ] -; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14) -; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] -; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) -; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] -; DEFAULT-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] -; DEFAULT-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) -; DEFAULT-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] -; DEFAULT-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] -; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; DEFAULT: [[PRED_STORE_IF]]: -; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] -; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 -; DEFAULT-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] -; DEFAULT: [[PRED_STORE_CONTINUE]]: -; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] -; DEFAULT: [[PRED_STORE_IF6]]: -; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 -; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] -; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 -; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; DEFAULT: [[PRED_STORE_CONTINUE7]]: -; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] -; DEFAULT: [[PRED_STORE_IF8]]: -; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] -; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 -; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE9]] -; DEFAULT: [[PRED_STORE_CONTINUE9]]: -; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] -; DEFAULT: [[PRED_STORE_IF10]]: -; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 -; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] -; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 -; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE11]] -; DEFAULT: [[PRED_STORE_CONTINUE11]]: -; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> 
[[TMP0]], i32 4 -; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] -; DEFAULT: [[PRED_STORE_IF12]]: -; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 -; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] -; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 -; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]] -; DEFAULT: [[PRED_STORE_CONTINUE13]]: -; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 -; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] -; DEFAULT: [[PRED_STORE_IF14]]: -; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 -; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] -; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 -; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]] -; DEFAULT: [[PRED_STORE_CONTINUE15]]: -; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 -; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]] -; DEFAULT: [[PRED_STORE_IF16]]: -; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 -; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] -; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 -; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]] -; DEFAULT: [[PRED_STORE_CONTINUE17]]: -; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 -; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] -; DEFAULT: [[PRED_STORE_IF18]]: -; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 -; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] -; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 -; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]] -; DEFAULT: [[PRED_STORE_CONTINUE19]]: -; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 -; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]] -; DEFAULT: [[PRED_STORE_IF20]]: -; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] -; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 -; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]] -; DEFAULT: [[PRED_STORE_CONTINUE21]]: -; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 -; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]] -; DEFAULT: [[PRED_STORE_IF22]]: -; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 -; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] -; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 -; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]] -; DEFAULT: [[PRED_STORE_CONTINUE23]]: -; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 -; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label 
%[[PRED_STORE_CONTINUE25:.*]] -; DEFAULT: [[PRED_STORE_IF24]]: -; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 -; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] -; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 -; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]] -; DEFAULT: [[PRED_STORE_CONTINUE25]]: -; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 -; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]] -; DEFAULT: [[PRED_STORE_IF26]]: -; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 -; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] -; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 -; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]] -; DEFAULT: [[PRED_STORE_CONTINUE27]]: -; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 -; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] -; DEFAULT: [[PRED_STORE_IF28]]: -; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 -; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] -; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 -; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]] -; DEFAULT: [[PRED_STORE_CONTINUE29]]: -; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 -; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] -; DEFAULT: [[PRED_STORE_IF30]]: -; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 -; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] -; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 -; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]] -; DEFAULT: [[PRED_STORE_CONTINUE31]]: -; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] -; DEFAULT: [[PRED_STORE_IF32]]: -; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 -; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] -; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 -; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]] -; DEFAULT: [[PRED_STORE_CONTINUE33]]: -; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_IF34]]: -; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 -; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] -; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 -; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_CONTINUE35]]: -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) -; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) -; DEFAULT-NEXT: br i1 true, 
label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; DEFAULT: [[SCALAR_PH:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*]]: ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[TMP72:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 ; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP72]] ; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP72]], 1 @@ -402,7 +230,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 ; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -499,7 +327,7 @@ define void @dont_vectorize_with_minsize() { ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH:.*]]: @@ -656,7 +484,7 @@ define void @vectorization_forced() { ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH:.*]]: @@ -675,7 +503,7 @@ define void @vectorization_forced() { ; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 ; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll index c35a3d7b9269f..3401fa65a01ef 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll @@ -6,10 +6,49 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) { ; CHECK-LABEL: define void @pr154103( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: 
+; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 4 x i64> [[TMP0]], splat (i64 7) +; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 1), [[TMP1]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 7, [[TMP3]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], <vscale x 4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[TMP5]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> [[TMP6]], <vscale x 4 x i64> splat (i64 1), i32 [[TMP2]]) +; CHECK-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i64> zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP9]], i32 [[TMP2]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER5]] to <vscale x 4 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <vscale x 4 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[TMP11]], <vscale x 4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = trunc <vscale x 4 x i64> [[PREDPHI]] to <vscale x 4 x i16> +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[TMP12]], <vscale x 4 x ptr> align 2 [[BROADCAST_SPLAT2]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]]) +; CHECK-NEXT: store i32 0, ptr [[D]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X]] to i64 @@ -28,7 +67,7 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia ; CHECK-NEXT: store i32 0, ptr [[D]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 7 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64
[[IV]], 0 -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll new file mode 100644 index 0000000000000..00117be0619ad --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt < %s -S -p loop-vectorize -mtriple=riscv64 -mattr=+v | FileCheck %s + +define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { +; CHECK-LABEL: define void @nested( +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: br i1 [[C0]], label %[[THEN_0:.*]], label %[[LATCH]] +; CHECK: [[THEN_0]]: +; CHECK-NEXT: br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]] +; CHECK: [[THEN_1]]: +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P0]], i32 [[IV1]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i32 [[X]] +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + br i1 %c0, label %then.0, label %latch + +then.0: + br i1 %c1, label %then.1, label %latch + +then.1: + %gep0 = getelementptr i32, ptr %p0, i32 %iv + %x = load i32, ptr %gep0 + %gep1 = getelementptr i32, ptr %p1, i32 %x + store i32 0, ptr %gep1 + br label %latch + +latch: + %iv.next = add i32 %iv, 1 + %done = icmp eq i32 %iv.next, 1024 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { +; CHECK-LABEL: define void @always_taken( +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P0]], i32
[[EVL_BASED_IV]] +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP2]], <vscale x 4 x i1> [[TMP0]], i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P1]], <vscale x 4 x i32> [[VP_OP_LOAD]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> align 4 [[TMP3]], <vscale x 4 x i1> [[TMP0]], i32 [[TMP1]]) +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: br i1 [[C0]], label %[[THEN_0:.*]], label %[[LATCH]], !prof [[PROF3:![0-9]+]] +; CHECK: [[THEN_0]]: +; CHECK-NEXT: br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]], !prof [[PROF3]] +; CHECK: [[THEN_1]]: +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr [[P0]], i32 [[IV]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP0]], align 4 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i32 [[X]] +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + br i1 %c0, label %then.0, label %latch, !prof !0 + +then.0: + br i1 %c1, label %then.1, label %latch, !prof !0 + +then.1: + %gep0 = getelementptr i32, ptr %p0, i32 %iv + %x = load i32, ptr %gep0 + %gep1 = getelementptr i32, ptr %p1, i32 %x + store i32 0, ptr %gep1 + br label %latch + +latch: + %iv.next = add i32 %iv, 1 + %done = icmp eq i32 %iv.next, 1024 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 0} diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll index a286df9bc2fc7..4685d6367e374 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll @@ -85,13 +85,13 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 +;
DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 21 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 43 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 71 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED-LABEL: 'test2' @@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 @@ -154,18 +154,18 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly ; DISABLED_MASKED_STRIDED-LABEL: 'test' ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; 
DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; ; ENABLED_MASKED_STRIDED-LABEL: 'test' ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 22 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll index 5e67bd57754e4..d285bc2357702 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll @@ -18,38 +18,43 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 22 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 44 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 ; ; SSE42-LABEL: 'test' ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 -; 
SSE42: LV: Found an estimated cost of 16 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 -; AVX1: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 -; AVX1: LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 -; AVX1: LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 17 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 34 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 68 for VF 32 For instruction: store i32 %valB, ptr %out, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 -; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 -; AVX2: LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 -; AVX2: LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: store i32 %valB, ptr %out, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; AVX512: LV: Found an estimated cost of 10 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 
For instruction: store i32 %valB, ptr %out, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: store i32 %valB, ptr %out, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll index faa2aa43d4934..0ef2703f3d16f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll @@ -18,38 +18,43 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 ; ; SSE42-LABEL: 'test' ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 8 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 16 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 -; AVX1: LV: Found an 
estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 -; AVX1: LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 -; AVX1: LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 9 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 18 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 36 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 72 for VF 32 For instruction: store i64 %valB, ptr %out, align 8 ; -; AVX2-LABEL: 'test' -; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 -; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 -; AVX2: LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 -; AVX2-NOFAST: LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST-LABEL: 'test' +; AVX2-NOFAST: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST: LV: Found an estimated cost of 9 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST: LV: Found an estimated cost of 18 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST: LV: Found an estimated cost of 36 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2-NOFAST: LV: Found an estimated cost of 72 for VF 32 For instruction: store i64 %valB, ptr %out, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: store i64 %valB, ptr %out, align 8 @@ -86,3 +91,5 @@ end: for.cond.cleanup: ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX2: {{.*}} diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll index 1d51a32a520a9..846c563491686 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll @@ -17,29 +17,33 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(ptr %C) { ; SSE-LABEL: 'test' ; SSE: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 -; SSE: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 -; SSE: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 -; SSE: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 -; SSE: LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %valB, ptr %out, align 2 +; SSE: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 +; SSE: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 +; SSE: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 +; SSE: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 +; SSE: LV: Found an estimated cost of 32 for VF 16 For instruction: store i16 %valB, ptr %out, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 -; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 -; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 -; AVX1: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 -; AVX1: LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %valB, ptr %out, align 2 -; AVX1: LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 +; AVX1: LV: Found an estimated cost of 33 for VF 16 For instruction: store i16 %valB, ptr %out, align 2 +; AVX1: LV: Found an estimated cost of 66 for VF 32 For instruction: store i16 %valB, ptr %out, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 -; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 -; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 -; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 -; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %valB, ptr %out, align 2 -; AVX2: LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 +; AVX2: 
LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: store i16 %valB, ptr %out, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: store i16 %valB, ptr %out, align 2 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2 ; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %valB, ptr %out, align 2 ; AVX512: LV: Found an estimated cost of 1 for VF 8 For instruction: store i16 %valB, ptr %out, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll index f011d06d319bb..56c1ad3af6d5f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll @@ -17,20 +17,23 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(ptr %C) { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 -; SSE2: LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 22 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 +; SSE2: LV: Found an estimated cost of 44 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 ; ; SSE42-LABEL: 'test' ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 -; SSE42: LV: Found an estimated cost of 16 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 +; SSE42: LV: Found an estimated cost of 
32 for VF 16 For instruction: store i32 %valB, ptr %out, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 ; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 ; AVX1: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 @@ -39,6 +42,7 @@ define void @test(ptr %C) { ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 ; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 @@ -47,6 +51,7 @@ define void @test(ptr %C) { ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4 ; AVX512: LV: Found an estimated cost of 1 for VF 4 For instruction: store i32 %valB, ptr %out, align 4 ; AVX512: LV: Found an estimated cost of 1 for VF 8 For instruction: store i32 %valB, ptr %out, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll index c004b16ae207d..e0bef719e18cc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll @@ -17,20 +17,23 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(ptr %C) { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 -; SSE2: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 +; SSE2: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 ; ; SSE42-LABEL: 'test' ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 4 for VF 4 For 
instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 8 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 -; SSE42: LV: Found an estimated cost of 16 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: store i64 %valB, ptr %out, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 ; AVX1: LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 ; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 ; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 @@ -39,6 +42,7 @@ define void @test(ptr %C) { ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 @@ -47,6 +51,7 @@ define void @test(ptr %C) { ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: store i64 %valB, ptr %out, align 8 ; AVX512: LV: Found an estimated cost of 1 for VF 4 For instruction: store i64 %valB, ptr %out, align 8 ; AVX512: LV: Found an estimated cost of 1 for VF 8 For instruction: store i64 %valB, ptr %out, align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll index 8bbe624849783..138d8c39724b3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll @@ -17,36 +17,41 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(ptr %C) { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 -; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 -; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 -; SSE2: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 -; SSE2: LV: Found an estimated cost of 23 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 
1 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 +; SSE2: LV: Found an estimated cost of 23 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 +; SSE2: LV: Found an estimated cost of 47 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 ; ; SSE42-LABEL: 'test' ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 -; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 -; SSE42: LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 -; SSE42: LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 -; SSE42: LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 +; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 -; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 -; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 -; AVX1: LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 -; AVX1: LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 -; AVX1: LV: Found an estimated cost of 32 for VF 32 For instruction: store i8 %valB, ptr %out, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 +; AVX1: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 +; AVX1: LV: Found an estimated cost of 65 for VF 32 For instruction: store i8 %valB, ptr %out, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 -; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 -; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 -; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 -; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 -; AVX2: LV: Found an estimated cost of 32 for VF 32 For instruction: store i8 %valB, ptr %out, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, ptr 
%out, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, ptr %out, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: store i8 %valB, ptr %out, align 1 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1 ; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %valB, ptr %out, align 1 ; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %valB, ptr %out, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index 21fa6ceb2cc12..4358429fc560d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -392,45 +392,24 @@ exit: define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( -; CHECK-NEXT: iter.check: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, 
ptr [[TMP12]], align 8 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 @@ -439,210 +418,43 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 ; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] -; CHECK: pred.store.if8: +; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.continue9: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 ; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] -; CHECK: pred.store.if10: +; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] -; CHECK: pred.store.continue11: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] -; CHECK: pred.store.if12: +; CHECK: pred.store.continue4: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 +; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.if5: ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] -; 
CHECK: pred.store.continue13: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 -; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] -; CHECK: pred.store.if14: -; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] -; CHECK: pred.store.continue15: -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 -; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] -; CHECK: pred.store.if16: -; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] -; CHECK: pred.store.continue17: -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] -; CHECK: pred.store.if18: -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] -; CHECK: pred.store.continue19: -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] -; CHECK: pred.store.if20: -; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] -; CHECK: pred.store.continue21: -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; CHECK: pred.store.if22: -; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] -; CHECK: pred.store.continue23: -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 -; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; CHECK: pred.store.if24: -; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] -; CHECK: pred.store.continue25: -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 -; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] -; CHECK: pred.store.if26: -; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 -; CHECK-NEXT: 
[[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.continue27: -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] -; CHECK: pred.store.if28: -; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] -; CHECK: pred.store.continue29: -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] -; CHECK: pred.store.if30: -; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] -; CHECK: pred.store.continue31: -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] -; CHECK: pred.store.if32: -; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] -; CHECK: pred.store.continue33: -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 -; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] -; CHECK: pred.store.if34: -; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] -; CHECK: pred.store.continue35: -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.if36: -; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 +; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP35]], 2 ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.continue37: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP83]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] 
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 -; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] -; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] -; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2 -; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]] -; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 -; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 -; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] -; CHECK: pred.store.if43: -; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 -; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 -; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] -; CHECK: pred.store.continue44: -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] -; CHECK: pred.store.if45: -; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 -; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 -; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] -; CHECK: pred.store.continue46: -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] -; CHECK: pred.store.if47: -; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 -; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] -; CHECK: pred.store.continue48: -; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3 -; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] -; CHECK: pred.store.if49: -; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 -; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] -; CHECK: pred.store.continue50: -; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 -; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 
[[INDEX_NEXT51]], [[N_VEC39]] -; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] ; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 @@ -655,7 +467,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -691,12 +503,58 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { ; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0 ; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32 ; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[CMP_I]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV5_I]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE8:%.*]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP14]], i32 0 +; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; CHECK: pred.urem.if: +; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]] +; CHECK: pred.urem.continue: +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP14]], i32 1 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] +; CHECK: pred.urem.if3: +; CHECK-NEXT: br label [[PRED_UREM_CONTINUE4]] +; CHECK: pred.urem.continue4: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP14]], i32 2 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] +; CHECK: pred.urem.if5: +; CHECK-NEXT: br label [[PRED_UREM_CONTINUE6]] +; CHECK: pred.urem.continue6: +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP14]], i32 3 +; CHECK-NEXT: br i1 [[TMP4]], label 
[[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] +; CHECK: pred.urem.if7: +; CHECK-NEXT: br label [[PRED_UREM_CONTINUE8]] +; CHECK: pred.urem.continue8: +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> poison, <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> zeroinitializer, <4 x i32> poison +; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[BROADCAST_SPLAT2]], [[PREDPHI]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 24) +; CHECK-NEXT: [[TMP10:%.*]] = ashr exact <4 x i32> [[TMP9]], splat (i32 24) +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[PREDPHI9]], i32 3 +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: -; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER1]] ] ; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]] ; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0 ; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24 @@ -711,9 +569,9 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { ; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]] ; CHECK: exit: -; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[P_2_LCSSA]] ; entry: @@ -839,7 +697,7 @@ define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 { ; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP42]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -859,7 +717,7 @@ define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 { ; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 16 -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], 
label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1198,12 +1056,12 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) ; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> [[PREDPHI]], i32 31 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF13:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0 @@ -1294,7 +1152,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) ; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 -; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i32 7 ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -1313,7 +1171,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[ZEXT]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i64 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP164]], [[MIDDLE_BLOCK]] ], [ [[TMP209]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[MERGE_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index d0c311eb4521f..fbb7c89c462b2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -435,78 +435,16 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3 ; CHECK-LABEL: @test_first_order_recurrence_tried_to_scalarized( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N:%.*]] = select i1 [[C:%.*]], i32 8, i32 9 -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 
[[N]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 10, [[TMP5]] -; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw i32 10, [[TMP10]] -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = sub nsw i32 10, [[TMP15]] -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = sub nsw i32 10, [[TMP20]] -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP12:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 4, [[SCALAR_PH]] ], [ [[IV]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 10, [[FOR]] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[IV]] ; CHECK-NEXT: store i32 [[SUB]], ptr [[GEP_DST]], align 4 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index cf04cd21c16be..557caa98ad434 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1073,27 +1073,49 @@ for.end: ; preds = %for.inc define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: define void @foo6( ; AVX1-SAME: ptr readonly captures(none) [[IN:%.*]], ptr captures(none) [[OUT:%.*]], i32 [[SIZE:%.*]], ptr readonly captures(none) [[TRIGGER:%.*]]) local_unnamed_addr #[[ATTR0]] { -; AVX1-NEXT: [[ENTRY:.*]]: -; AVX1-NEXT: br label %[[FOR_BODY:.*]] -; AVX1: [[FOR_BODY]]: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; AVX1-NEXT: [[ENTRY:.*:]] +; AVX1-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; AVX1: [[VECTOR_MEMCHECK]]: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 32768 +; AVX1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER]], i64 16384 +; AVX1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN]], i64 32768 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]] +; AVX1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]] +; AVX1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; AVX1: [[VECTOR_PH]]: +; AVX1-NEXT: br label %[[VECTOR_BODY:.*]] +; AVX1: [[VECTOR_BODY]]: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = sub i64 4095, [[INDEX]] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 -; AVX1-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] -; AVX1: [[IF_THEN]]: -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01 -; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds 
double, ptr [[OUT]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8 -; AVX1-NEXT: br label %[[FOR_INC]] -; AVX1: [[FOR_INC]]: -; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; AVX1-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX1-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]] -; AVX1: [[FOR_END]]: -; AVX1-NEXT: ret void +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -3 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META18:![0-9]+]] +; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP5]], i32 -3 +; AVX1-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP6]], i32 8, <4 x i1> [[REVERSE6]], <4 x double> poison), !alias.scope [[META21:![0-9]+]] +; AVX1-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[REVERSE7]], splat (double 5.000000e-01) +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP8]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[TMP9]], i32 -3 +; AVX1-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE9]], ptr [[TMP10]], i32 8, <4 x i1> [[REVERSE6]]), !alias.scope [[META23:![0-9]+]], !noalias [[META25:![0-9]+]] +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX1-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; AVX1: [[MIDDLE_BLOCK]]: +; AVX1-NEXT: br [[FOR_END:label %.*]] +; AVX1: [[SCALAR_PH]]: ; ; AVX2-LABEL: define void @foo6( ; AVX2-SAME: ptr readonly captures(none) [[IN:%.*]], ptr captures(none) [[OUT:%.*]], i32 [[SIZE:%.*]], ptr readonly captures(none) [[TRIGGER:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -1373,14 +1395,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; AVX1: [[MIDDLE_BLOCK]]: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX1: [[VEC_EPILOG_ITER_CHECK]]: ; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 
[[N_VEC_REMAINING]], 4 -; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19:![0-9]+]] +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29:![0-9]+]] ; AVX1: [[VEC_EPILOG_PH]]: ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 @@ -1400,7 +1422,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]]) ; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] -; AVX1-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; AVX1: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX1-NEXT: br i1 [[CMP_N14]], [[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] @@ -1697,14 +1719,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; AVX1: [[MIDDLE_BLOCK]]: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX1: [[VEC_EPILOG_ITER_CHECK]]: ; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19]] +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29]] ; AVX1: [[VEC_EPILOG_PH]]: ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 @@ -1724,7 +1746,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]]) ; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] -; AVX1-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; AVX1: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] 
; AVX1-NEXT: br i1 [[CMP_N14]], [[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll index e25be6f867862..466a41b205eeb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll @@ -1116,9 +1116,51 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; COST-LABEL: define void @br_under_switch_default_common_dest_with_case( ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; COST-NEXT: [[ENTRY:.*]]: +; COST-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; COST-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; COST-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; COST-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; COST-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; COST-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; COST-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; COST: [[VECTOR_PH]]: +; COST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; COST-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; COST-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; COST-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; COST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; COST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; COST-NEXT: br label %[[VECTOR_BODY:.*]] +; COST: [[VECTOR_BODY]]: +; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]] +; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1 +; COST-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12) +; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13) +; COST-NEXT: [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]] +; COST-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; COST-NEXT: [[TMP10:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; COST-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true) +; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer +; COST-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP12]], [[TMP7]] +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP13]]) +; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP10]], <4 x i1> zeroinitializer +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP14]]) +; COST-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP9]] +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]]) +; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; COST-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; COST-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; COST: [[MIDDLE_BLOCK]]: +; COST-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; COST-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], 
label %[[SCALAR_PH]] +; COST: [[SCALAR_PH]]: +; COST-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; COST-NEXT: br label %[[LOOP_HEADER:.*]] ; COST: [[LOOP_HEADER]]: -; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; COST-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; COST-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; COST-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -1140,7 +1182,7 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; COST: [[LOOP_LATCH]]: ; COST-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; COST-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; COST-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; COST-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] ; COST: [[EXIT]]: ; COST-NEXT: ret void ; @@ -1433,6 +1475,8 @@ exit: ; COST: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ; COST: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; COST: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; COST: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; COST: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ;. ; FORCED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; FORCED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll index 414394a8942e5..7d31b81e3b45d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -362,50 +362,17 @@ define dso_local void @test(ptr noalias nocapture %points, ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: entry: ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nuw nsw <4 x i64> [[VEC_IND]], splat (i64 3) -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP4]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0 -; 
ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP6]], ptr [[TMP5]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] -; ENABLED_MASKED_STRIDED: pred.store.continue: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if1: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP8]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP10]], ptr [[TMP9]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE2]] -; ENABLED_MASKED_STRIDED: pred.store.continue2: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if3: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP12]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP14]], ptr [[TMP13]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE4]] -; ENABLED_MASKED_STRIDED: pred.store.continue4: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; ENABLED_MASKED_STRIDED: pred.store.if5: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP18]], ptr [[TMP17]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] -; ENABLED_MASKED_STRIDED: pred.store.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[DOTIDX:%.*]] = mul i64 [[INDEX]], 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 [[DOTIDX]] +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <12 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <12 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <12 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v12i16.p0(<12 x i16> [[INTERLEAVED_VEC]], ptr [[TMP2]], i32 2, <12 x i1> [[TMP3]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP19]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 9deab9063d710..89aebb931244c 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -49,6 +49,8 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED1]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -57,9 +59,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: @@ -125,6 +125,8 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%rem>, ir<%recur.next> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -134,9 +136,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: @@ -284,27 +284,44 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK: pred.store: { -; CHECK-NEXT: pred.store.entry: +; CHECK: pred.load: { +; CHECK-NEXT: pred.load.entry: ; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> -; CHECK-NEXT: REPLICATE ir<%conv.lv.2> = sext ir<%lv.2> -; CHECK-NEXT: REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem> -; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%add.1>, ir<%conv.lv.2> -; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> -; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK: 
pred.load.if: +; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.continue: +; CHECK: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.2 +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, ir<%rem> +; CHECK-NEXT: WIDEN-CAST ir<%conv.lv.2> = sext vp<%9> to i32 +; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK: loop.2: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -368,6 +385,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -377,7 +395,6 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%rem.div> = sdiv ir<20>, ir<%rem> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%rem.div>, ir<%gep> @@ -448,6 +465,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<[[L]]> = load ir<%src> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn>, ir<[[L]]> +; CHECK-NEXT: WIDEN ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -458,7 +476,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]> -; CHECK-NEXT: REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE store ir<%val>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 994e9c1ce64fa..c919b2e6f7a9c 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -28,28 +28,45 @@ target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> -; CHECK-NEXT: Successor(s): pred.store - -; CHECK: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue - -; CHECK: pred.store.if: -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr inbounds ir<@b>, ir<0>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%lv.b>, ir<10> -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr inbounds ir<@a>, ir<0>, vp<[[STEPS]] -; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<2>, ir<%add> -; CHECK-NEXT: REPLICATE store ir<%mul>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue - -; CHECK: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } - +; CHECK-NEXT: Successor(s): pred.load +; CHECK-EMPTY: +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr inbounds ir<@b>, ir<0>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.0 +; CHECK-EMPTY: +; CHECK-NEXT: loop.0: +; CHECK-NEXT: WIDEN ir<%add> = add vp<%8>, ir<10> +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<2>, ir<%add> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr inbounds ir<@a>, ir<0>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE store ir<%mul>, ir<%gep.a> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.1 ; CHECK: loop.1: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> @@ -760,28 +777,46 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr inbounds 
ir<@a>, ir<0>, vp<[[STEPS]]>
-; CHECK-NEXT: REPLICATE ir<%lv.a> = load ir<%gep.a>
-; CHECK-NEXT: REPLICATE ir<%div> = sdiv ir<%lv.a>, ir<%lv.a>
-; CHECK-NEXT: REPLICATE store ir<%div>, ir<%gep.a>
-; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.2
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.2:
+; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr inbounds ir<@a>, ir<0>, vp<[[STEPS]]>
+; CHECK-NEXT: Successor(s): pred.load
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.load: {
+; CHECK-NEXT: pred.load.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
+; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.load.if:
+; CHECK-NEXT: REPLICATE ir<%lv.a> = load ir<%gep.a> (S->V)
+; CHECK-NEXT: Successor(s): pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.load.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.a>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): loop.0
+; CHECK-EMPTY:
+; CHECK-NEXT: loop.0:
+; CHECK-NEXT: EMIT vp<[[SELECT:%.+]]> = select vp<[[MASK]]>, vp<[[PRED]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%div> = sdiv vp<[[PRED]]>, vp<[[SELECT]]>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: REPLICATE store ir<%div>, ir<%gep.a>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): loop.1
+; CHECK-EMPTY:
+; CHECK-NEXT: loop.1:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -849,6 +884,8 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: loop.0:
 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%for>, vp<[[PRED]]>
+; CHECK-NEXT: EMIT vp<[[SELECT:%.+]]> = select vp<[[MASK]]>, vp<[[PRED]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%div> = sdiv vp<[[SPLICE]]>, vp<[[SELECT]]>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store: {
@@ -857,16 +894,15 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) {
 ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: REPLICATE ir<%div> = sdiv vp<[[SPLICE]]>, vp<[[PRED]]>
 ; CHECK-NEXT: REPLICATE store ir<%div>, ir<%gep.a>
 ; CHECK-NEXT: Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.continue:
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.2
+; CHECK-NEXT: Successor(s): loop.1
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.2:
+; CHECK-NEXT: loop.1:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
index 2fe420183c683..efd7b4afe255c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
@@ -12,8 +12,86 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX-NEXT: entry:
 ; AVX-NEXT: [[I11_NOT1:%.*]] = icmp eq ptr [[START:%.*]], [[END:%.*]]
 ; AVX-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12:%.*]]
+; AVX: iter.check:
+; AVX-NEXT: [[END3:%.*]] = ptrtoint ptr [[END]] to i64
+; AVX-NEXT: [[START4:%.*]] = ptrtoint ptr [[START]] to i64
+; AVX-NEXT: [[TMP0:%.*]] = add i64 [[END3]], -4
+; AVX-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START4]]
+; AVX-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
+; AVX-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 28
+; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; AVX: vector.main.loop.iter.check:
+; AVX-NEXT: [[MIN_ITERS_CHECK5:%.*]] = icmp ult i64 [[TMP1]], 124
+; AVX-NEXT: br i1 [[MIN_ITERS_CHECK5]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX: vector.ph:
+; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776
+; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX: vector.body:
+; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 2
+; AVX-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; AVX-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64
+; AVX-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[NEXT_GEP]], align 4
+; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
+; AVX-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; AVX-NEXT: [[TMP8:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], splat (i32 -12)
+; AVX-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD6]], splat (i32 -12)
+; AVX-NEXT: [[TMP10:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD7]], splat (i32 -12)
+; AVX-NEXT: [[TMP11:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], splat (i32 -12)
+; AVX-NEXT: [[TMP12:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], splat (i32 13)
+; AVX-NEXT: [[TMP13:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD6]], splat (i32 13)
+; AVX-NEXT: [[TMP14:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD7]], splat (i32 13)
+; AVX-NEXT: [[TMP15:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], splat (i32 13)
+; AVX-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP8]], [[TMP12]]
+; AVX-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP9]], [[TMP13]]
+; AVX-NEXT: [[TMP18:%.*]] = or <8 x i1> [[TMP10]], [[TMP14]]
+; AVX-NEXT: [[TMP19:%.*]] = or <8 x i1> [[TMP11]], [[TMP15]]
+; AVX-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[NEXT_GEP]], i32 4, <8 x i1> [[TMP16]])
+; AVX-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[TMP5]], i32 4, <8 x i1> [[TMP17]])
+; AVX-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[TMP6]], i32 4, <8 x i1> [[TMP18]])
+; AVX-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[TMP7]], i32 4, <8 x i1> [[TMP19]])
+; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; AVX-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX: middle.block:
+; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; AVX-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; AVX: vec.epilog.iter.check:
+; AVX-NEXT: [[TMP21:%.*]] = shl i64 [[N_VEC]], 2
+; AVX-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP21]]
+; AVX-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24
+; AVX-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
+; AVX-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; AVX: vec.epilog.ph:
+; AVX-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AVX-NEXT: [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800
+; AVX-NEXT: [[TMP22:%.*]] = shl i64 [[N_VEC10]], 2
+; AVX-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP22]]
+; AVX-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; AVX: vec.epilog.vector.body:
+; AVX-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX11]], 2
+; AVX-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[NEXT_GEP12]], align 4
+; AVX-NEXT: [[TMP24:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD13]], splat (i32 -12)
+; AVX-NEXT: [[TMP25:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD13]], splat (i32 13)
+; AVX-NEXT: [[TMP26:%.*]] = or <8 x i1> [[TMP24]], [[TMP25]]
+; AVX-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[NEXT_GEP12]], i32 4, <8 x i1> [[TMP26]])
+; AVX-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
+; AVX-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
+; AVX-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX: vec.epilog.middle.block:
+; AVX-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
+; AVX-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[BB12_PREHEADER]]
+; AVX: bb12.preheader:
+; AVX-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; AVX-NEXT: br label [[BB13:%.*]]
 ; AVX: bb12:
-; AVX-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[START]], [[ENTRY:%.*]] ]
+; AVX-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER]] ]
 ; AVX-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4
 ; AVX-NEXT: switch i32 [[VAL]], label [[LATCH]] [
 ; AVX-NEXT: i32 -12, label [[STORE:%.*]]
@@ -25,7 +103,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX: latch:
 ; AVX-NEXT: [[PTR_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR2]], i64 4
 ; AVX-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]]
-; AVX-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]]
+; AVX-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX: exit:
 ; AVX-NEXT: ret void
 ;
@@ -86,7 +164,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP26]]
 ; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24
 ; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]]
+; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]], !prof [[PROF3:![0-9]+]]
 ; AVX2: vec.epilog.ph:
 ; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX2-NEXT: [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800
@@ -104,7 +182,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[NEXT_GEP14]], i32 4, <8 x i1> [[TMP24]])
 ; AVX2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 8
 ; AVX2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC10]]
-; AVX2-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AVX2: vec.epilog.middle.block:
 ; AVX2-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
 ; AVX2-NEXT: br i1 [[CMP_N17]], label [[EXIT]], label [[BB12_PREHEADER1]]
@@ -124,7 +202,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2: latch:
 ; AVX2-NEXT: [[PTR_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR2]], i64 4
 ; AVX2-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]]
-; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX2: exit:
 ; AVX2-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-vectorize-bfi.ll b/llvm/test/Transforms/PhaseOrdering/loop-vectorize-bfi.ll
new file mode 100644
index 0000000000000..50c123ac28f2e
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/loop-vectorize-bfi.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; REQUIRES: riscv-registered-target
+; RUN: opt -p 'lto' -mtriple riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; Test that BlockFrequencyInfo is invalidated after loop passes so that it is
+; accurate whenever LoopVectorize uses it. LoopVectorize requires that an
+; innermost loop header's block frequency is greater than or equal to the
+; frequency of every block it dominates.
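+;
+; Illustrative numbers (an aside, not checked by this test): if the header has
+; a block frequency of 1024 and a predicated block it dominates has a
+; frequency of 256, that block is assumed to run on roughly 1 in 4 iterations,
+; so its cost is divided by 4. A stale BFI could report a dominated block as
+; hotter than the header and break that assumption.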
+
+define void @f(i1 %0) !prof !0 {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: i1 [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {{.*}}{
+; CHECK-NEXT: [[VECTOR_PH:.*]]:
+; CHECK-NEXT: [[DOTSCALAR:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[DOTSCALAR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 65, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr null, i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 [[TMP4]], <vscale x 2 x i1> [[TMP2]], i32 [[TMP3]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 [[TMP4]], <vscale x 2 x i1> [[TMP2]], i32 [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[EVL_BASED_IV]], [[TMP5]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw nsw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: ret void
+;
+ br label %2
+
+2: ; preds = %9, %1
+ %3 = phi i64 [ %10, %9 ], [ 0, %1 ]
+ %4 = getelementptr i64, ptr null, i64 %3
+ br label %5
+
+5: ; preds = %8, %2
+ %6 = phi i1 [ false, %2 ], [ true, %8 ]
+ br i1 %0, label %8, label %7
+
+7: ; preds = %5
+ store i64 0, ptr %4, align 8
+ br label %8
+
+8: ; preds = %7, %5
+ br i1 %6, label %9, label %5
+
+9: ; preds = %8
+ %10 = add i64 %3, 1
+ %11 = icmp eq i64 %3, 64
+ br i1 %11, label %12, label %2
+
+12: ; preds = %9
+ ret void
+}
+
+!0 = !{!"function_entry_count", i64 1}