diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 623814c038a78..3c712ead95318 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -12778,10 +12778,23 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
   // The positive stride case is the same as isKnownPositive(Stride) returning
   // true (original behavior of the function).
   //
-  if (PredicatedIV || !NoWrap || !loopIsFiniteByAssumption(L) ||
+  if (PredicatedIV || !loopIsFiniteByAssumption(L) ||
       !loopHasNoAbnormalExits(L))
     return getCouldNotCompute();
 
+  // If the IV lacks no-wrap flags, record a Stride == 1 predicate instead
+  // of bailing out. This may enable strided-access versioning in LAA and
+  // allows the backedge-taken count to be computed with Stride = 1.
+  if (!NoWrap) {
+    if (AllowPredicates) {
+      const auto *One =
+          static_cast<const SCEVConstant *>(getOne(Stride->getType()));
+      Predicates.insert(getEqualPredicate(Stride, One));
+      Stride = One;
+    } else
+      return getCouldNotCompute();
+  }
+
   if (!isKnownNonZero(Stride)) {
     // If we have a step of zero, and RHS isn't invariant in L, we don't know
     // if it might eventually be greater than start and if so, on which
diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
index 7bf4fbd89b0ee..8396d62570f21 100644
--- a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -92,3 +92,57 @@ for.end.loopexit:
 for.end:
   ret void
 }
+
+; We can vectorize the loop by assuming stride == 1 to compute the iteration
+; count, guarding the vectorized loop with a runtime stride check.
+
+; CHECK-LABEL: s172
+; CHECK: vector.scevcheck:
+; CHECK: [[CHECK:%.*]] = icmp ne i32 %xb, 1
+; CHECK: br i1 [[CHECK]], label %scalar.ph, label %vector.ph
+; CHECK: vector.body
+
+@b = global [32000 x float] zeroinitializer, align 64
+@a = global [32000 x float] zeroinitializer, align 64
+
+; for (int i = xa - 1; i < 32000; i += xb)
+;   a[i] += b[i];
+;
+define float @s172(i32 signext %xa, i32 signext %xb) mustprogress {
+entry:
+  %cmp214 = icmp slt i32 %xa, 32001
+  br i1 %cmp214, label %for.body.us.preheader, label %for.cond.cleanup
+
+for.body.us.preheader:                            ; preds = %entry
+  %sub = add i32 %xa, -1
+  %0 = sext i32 %sub to i64
+  %1 = sext i32 %xb to i64
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %nl.016.us = phi i32 [ %inc.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ]
+  br label %for.body4.us
+
+for.body4.us:                                     ; preds = %for.body.us, %for.body4.us
+  %indvars.iv = phi i64 [ %0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ]
+  %arrayidx.us = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx.us, align 4
+  %arrayidx6.us = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx6.us, align 4
+  %add.us = fadd fast float %3, %2
+  store float %add.us, ptr %arrayidx6.us, align 4
+  %indvars.iv.next = add i64 %indvars.iv, %1
+  %cmp2.us = icmp slt i64 %indvars.iv.next, 32000
+  br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %inc.us = add nuw nsw i32 %nl.016.us, 1
+  %exitcond.not = icmp eq i32 %inc.us, 100000
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body.us
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret float undef
+}
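
Note: the following is a conceptual C sketch, not compiler output; the function
name is illustrative. It shows the loop versioning the Stride == 1 predicate
enables: LoopVectorize emits the runtime stride check seen in the
vector.scevcheck CHECK lines (icmp ne i32 %xb, 1) and branches between a
vectorizable version of the s172 loop and the original scalar one.

    /* Conceptual sketch of the versioned s172 loop; the guard mirrors the
       vector.scevcheck block in the CHECK lines above. */
    void s172_versioned(float *a, float *b, int xa, int xb) {
      if (xb != 1) {
        /* Scalar fallback: stride is not 1 at run time, trip count unknown. */
        for (int i = xa - 1; i < 32000; i += xb)
          a[i] += b[i];
      } else {
        /* Vectorizable path: stride proven to be 1, so the backedge-taken
           count is computable and the loop can be vectorized. */
        for (int i = xa - 1; i < 32000; i += 1)
          a[i] += b[i];
      }
    }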