diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 179a2c38d9d3c..328926f0b7aa6 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -2376,6 +2376,10 @@ class PredicatedScalarEvolution {
   /// Get the (predicated) symbolic max backedge count for the analyzed loop.
   const SCEV *getSymbolicMaxBackedgeTakenCount();
 
+  /// Returns the upper bound of the loop trip count as a normal unsigned
+  /// value, or 0 if the trip count is unknown.
+  unsigned getSmallConstantMaxTripCount();
+
   /// Adds a new predicate.
   void addPredicate(const SCEVPredicate &Pred);
 
@@ -2447,6 +2451,9 @@ class PredicatedScalarEvolution {
 
   /// The symbolic backedge taken count.
   const SCEV *SymbolicMaxBackedgeCount = nullptr;
+
+  /// The constant max trip count for the loop.
+  std::optional<unsigned> SmallConstantMaxTripCount;
 };
 
 template <> struct DenseMapInfo<ScalarEvolution::FoldID> {
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c939270ed39a6..515acd70d9b03 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -15051,6 +15051,16 @@ const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() {
   return SymbolicMaxBackedgeCount;
 }
 
+unsigned PredicatedScalarEvolution::getSmallConstantMaxTripCount() {
+  if (!SmallConstantMaxTripCount) {
+    SmallVector<const SCEVPredicate *, 4> Preds;
+    SmallConstantMaxTripCount = SE.getSmallConstantMaxTripCount(&L, &Preds);
+    for (const auto *P : Preds)
+      addPredicate(*P);
+  }
+  return *SmallConstantMaxTripCount;
+}
+
 void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
   if (Preds->implies(&Pred))
     return;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 001c8987667df..22c1fa9020cdf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -411,10 +411,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
 /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
 /// 4) Returns std::nullopt if all of the above failed.
 static std::optional<unsigned>
-getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
+getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
                     bool CanUseConstantMax = true) {
   // Check if exact trip count is known.
-  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
+  if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
     return ExpectedTC;
 
   // Check if there is an expected trip count available from profile data.
@@ -426,7 +426,7 @@ getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
     return std::nullopt;
 
   // Check if upper bound estimate is known.
-  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
+  if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
     return ExpectedTC;
 
   return std::nullopt;
@@ -1787,12 +1787,15 @@ class GeneratedRTChecks {
 
   Loop *OuterLoop = nullptr;
 
+  PredicatedScalarEvolution &PSE;
+
 public:
-  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
-                    TargetTransformInfo *TTI, const DataLayout &DL,
-                    bool AddBranchWeights)
-      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
-        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
+  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
+                    LoopInfo *LI, TargetTransformInfo *TTI,
+                    const DataLayout &DL, bool AddBranchWeights)
+      : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
+        MemCheckExp(*PSE.getSE(), DL, "scev.check"),
+        AddBranchWeights(AddBranchWeights), PSE(PSE) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1939,7 +1942,7 @@ class GeneratedRTChecks {
 
       // Get the best known TC estimate.
       if (auto EstimatedTC = getSmallBestKnownTC(
-              *SE, OuterLoop, /* CanUseConstantMax = */ false))
+              PSE, OuterLoop, /* CanUseConstantMax = */ false))
         BestTripCount = *EstimatedTC;
 
       BestTripCount = std::max(BestTripCount, 1U);
@@ -2270,8 +2273,7 @@ static bool isIndvarOverflowCheckKnownFalse(
   // We know the runtime overflow check is known false iff the (max) trip-count
   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
   // the vector loop induction variable.
-  if (unsigned TC =
-          Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
+  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
     uint64_t MaxVF = VF.getKnownMinValue();
     if (VF.isScalable()) {
       std::optional<unsigned> MaxVScale =
@@ -3956,8 +3958,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   }
 
   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
+  unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+  if (TC != MaxTC)
+    LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
 
   if (TC == 1) {
     reportVectorizationFailure("Single iteration (non) loop",
         "loop trip count is one, irrelevant for vectorization",
@@ -4251,7 +4255,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   InstructionCost CostA = A.Cost;
   InstructionCost CostB = B.Cost;
 
-  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
+  unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
 
   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
@@ -4839,7 +4843,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
 
-  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+  auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
   const bool HasReductions = !Legal->getReductionVars().empty();
 
   // If we did not calculate the cost for VF (because the user selected the VF)
@@ -9583,8 +9587,8 @@ static bool processLoopInVPlanNativePath(
   {
     bool AddBranchWeights =
        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                             F->getDataLayout(), AddBranchWeights);
+    GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+                             AddBranchWeights);
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9648,7 +9652,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF,
                                        std::optional<unsigned> VScale, Loop *L,
-                                       ScalarEvolution &SE,
+                                       PredicatedScalarEvolution &PSE,
                                        ScalarEpilogueLowering SEL) {
   InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
@@ -9733,7 +9737,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
 
   // Skip vectorization if the expected trip count is less than the minimum
   // required trip count.
-  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
+  if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
                                 VF.MinProfitableTripCount)) {
       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
@@ -9840,7 +9844,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
-  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+  auto ExpectedTC = getSmallBestKnownTC(PSE, L);
   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                       << "This loop is worth vectorizing only if no scalar "
@@ -9938,8 +9942,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                           F->getDataLayout(), AddBranchWeights);
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+                           AddBranchWeights);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
@@ -9955,7 +9959,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
-                                    *PSE.getSE(), SEL)) {
+                                    PSE, SEL)) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
             DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
new file mode 100644
index 0000000000000..1ec384b05779a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -0,0 +1,397 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; REQUIRES: asserts
+; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; DEBUG-LABEL: LV: Checking a loop in 'low_vf_ic_is_better'
+; DEBUG: LV: Found trip count: 0
+; DEBUG: LV: Found maximum trip count: 19
+; DEBUG: LV: IC is 1
+; DEBUG: LV: VF is vscale x 8
+; DEBUG: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:vscale x 4, Epilogue Loop UF:1
+
+; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
+; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
+; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value..
+
+; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks'
+; DEBUG: LV: Found trip count: 0
+; DEBUG: LV: Found maximum trip count: 16
+; DEBUG: LV: Clamping the MaxVF to maximum power of two not exceeding the constant trip count: 16
+; DEBUG: LV: IC is 1
+; DEBUG: LV: VF is 16
+; DEBUG: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 32)
+; DEBUG: LV: Too many memory checks needed.
+
+; DEBUG-LABEL: LV: Checking a loop in 'overflow_indvar_known_false'
+; DEBUG: LV: Found trip count: 0
+; DEBUG: LV: Found maximum trip count: 1027
+; DEBUG: LV: can fold tail by masking.
+; DEBUG: Executing best plan with VF=vscale x 16, UF=1
+
+define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef %val) {
+; CHECK-LABEL: define void @low_vf_ic_is_better(
+; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 19
+; CHECK-NEXT: br i1 [[CMP7]], label %[[ITER_CHECK:.*]], label %[[WHILE_END:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8
+; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 19, [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[TMP8]], 4294967295
+; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 8 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP23]], ptr [[TMP22]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP33]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], [[TMP35]]
+; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC4]]
+; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = add i64 [[TMP0]], [[INDEX6]]
+; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX7]], 0
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP40]], align 1
+; CHECK-NEXT: [[TMP41:%.*]] = add <vscale x 4 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: store <vscale x 4 x i8> [[TMP41]], ptr [[TMP40]], align 1
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP37]]
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[CMP_N12]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 19
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[WHILE_END]]
+; CHECK: [[WHILE_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp7 = icmp ult i32 %tc, 19
+  br i1 %cmp7, label %while.preheader, label %while.end
+
+while.preheader:
+  %conv = trunc i16 %val to i8
+  %v = getelementptr inbounds nuw i8, ptr %p, i64 4
+  %0 = zext nneg i32 %tc to i64
+  br label %while.body
+
+while.body:
+  %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %add = add i8 %1, %conv
+  store i8 %add, ptr %arrayidx, align 1
+  %2 = and i64 %iv.next, 4294967295
+  %exitcond.not = icmp eq i64 %2, 19
+  br i1 %exitcond.not, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @trip_count_too_small(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) {
+; CHECK-LABEL: define void @trip_count_too_small(
+; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 3
+; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]]
+; CHECK: [[WHILE_PREHEADER]]:
+; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8
+; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 3
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; CHECK: [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[WHILE_END]]
+; CHECK: [[WHILE_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp7 = icmp ult i32 %tc, 3
+  br i1 %cmp7, label %while.preheader, label %while.end
+
+while.preheader:
+  %conv = trunc i16 %val to i8
+  %v = getelementptr inbounds nuw i8, ptr %p, i64 4
+  %0 = zext nneg i32 %tc to i64
+  br label %while.body
+
+while.body:
+  %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %add = add i8 %1, %conv
+  store i8 %add, ptr %arrayidx, align 1
+  %2 = and i64 %iv.next, 4294967295
+  %exitcond.not = icmp eq i64 %2, 3
+  br i1 %exitcond.not, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @too_many_runtime_checks(ptr nocapture noundef %p, ptr nocapture noundef %p1, ptr nocapture noundef readonly %p2, ptr nocapture noundef readonly %p3, i32 noundef %tc, i16 noundef %val) {
+; CHECK-LABEL: define void @too_many_runtime_checks(
+; CHECK-SAME: ptr nocapture noundef [[P:%.*]], ptr nocapture noundef [[P1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], ptr nocapture noundef readonly [[P3:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[TC]], 16
+; CHECK-NEXT: br i1 [[CMP20]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]]
+; CHECK: [[WHILE_PREHEADER]]:
+; CHECK-NEXT: [[CONV8:%.*]] = trunc i16 [[VAL]] to i8
+; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TC]] to i64
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP1]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P3]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP61]], [[TMP60]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP62]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1
+; CHECK-NEXT: [[ADD12:%.*]] = add i8 [[TMP63]], [[CONV8]]
+; CHECK-NEXT: store i8 [[ADD12]], ptr [[ARRAYIDX10]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[TMP64:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP64]], 16
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; CHECK: [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[WHILE_END]]
+; CHECK: [[WHILE_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp20 = icmp ult i32 %tc, 16
+  br i1 %cmp20, label %while.preheader, label %while.end
+
+while.preheader:
+  %0 = trunc i16 %val to i8
+  %v = getelementptr inbounds nuw i8, ptr %p, i64 4
+  %1 = zext nneg i32 %tc to i64
+  br label %while.body
+
+while.body:
+  %iv = phi i64 [ %1, %while.preheader ], [ %iv.next, %while.body ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %p2, i64 %iv
+  %2 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %p3, i64 %iv
+  %3 = load i8, ptr %arrayidx2, align 1
+  %mul = mul i8 %3, %2
+  %arrayidx5 = getelementptr inbounds nuw i8, ptr %p1, i64 %iv
+  %4 = load i8, ptr %arrayidx5, align 1
+  %add = add i8 %mul, %4
+  store i8 %add, ptr %arrayidx5, align 1
+  %arrayidx10 = getelementptr inbounds nuw i8, ptr %v, i64 %iv
+  %5 = load i8, ptr %arrayidx10, align 1
+  %add12 = add i8 %5, %0
+  store i8 %add12, ptr %arrayidx10, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %6 = and i64 %iv.next, 4294967295
+  %exitcond.not = icmp eq i64 %6, 16
+  br i1 %exitcond.not, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) vscale_range(1,16) {
+; CHECK-LABEL: define void @overflow_indvar_known_false(
+; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 1027
+; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]]
+; CHECK: [[WHILE_PREHEADER]]:
+; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8
+; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TC]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1028, [[TMP20]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TC]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = sub i64 1027, [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp ugt i64 [[TMP23]], 4294967295
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]])
+; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[TMP16]], i32 0
+; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP18]], [[CONV]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP29]], 1027
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[WHILE_END]]
+; CHECK: [[WHILE_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp7 = icmp ult i32 %tc, 1027
+  br i1 %cmp7, label %while.preheader, label %while.end
+
+while.preheader:
+  %conv = trunc i16 %val to i8
+  %v = getelementptr inbounds nuw i8, ptr %p, i64 4
+  %0 = zext nneg i32 %tc to i64
+  br label %while.body
+
+while.body:
+  %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %add = add i8 %1, %conv
+  store i8 %add, ptr %arrayidx, align 1
+  %2 = and i64 %iv.next, 4294967295
+  %exitcond.not = icmp eq i64 %2, 1027
+  br i1 %exitcond.not, label %while.end, label %while.body, !llvm.loop !0
+
+while.end:
+  ret void
+}
+
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 1d5e6c117a2ea..9a716f7756072 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -20,6 +20,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
 ; CHECK-NEXT: LV: Found trip count: 0
+; CHECK-NEXT: LV: Found maximum trip count: 4294967295
 ; CHECK-NEXT: LV: Scalable vectorization is available
 ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
 ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
@@ -224,6 +225,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
 ; CHECK-NEXT: LV: Found trip count: 0
+; CHECK-NEXT: LV: Found maximum trip count: 4294967295
 ; CHECK-NEXT: LV: Scalable vectorization is available
 ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
 ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
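
A minimal usage sketch (not part of the patch) of the new query; hasSmallPredicatedTripCount is a hypothetical helper name, and the only behavior relied on is what the new doc comment states: a return value of 0 means the bound is unknown, and any SCEV predicates needed to justify a tighter bound are recorded in the PSE.

  // Hypothetical caller of the new API. Because the query may take on SCEV
  // predicates, a client acting on the bound must emit runtime checks for
  // PSE.getPredicate() before relying on it (as LoopVectorize does).
  static bool hasSmallPredicatedTripCount(PredicatedScalarEvolution &PSE,
                                          unsigned Threshold) {
    unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
    return MaxTC != 0 && MaxTC < Threshold; // 0 means the bound is unknown.
  }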