diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f3de24aa4c3d1..fb6640d5cfcf8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -175,6 +175,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] = STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); +STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized"); static cl::opt<bool> EnableEpilogueVectorization( "enable-epilogue-vectorization", cl::init(true), cl::Hidden, @@ -7205,6 +7206,8 @@ DenseMap LoopVectorizationPlanner::executePlan( "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); + if (BestVPlan.hasEarlyExit()) + ++LoopsEarlyExitVectorized; // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9a6e4b36397b3..85741b977bb77 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4183,7 +4183,8 @@ class VPlan { /// block with multiple predecessors (one for the exit via the latch and one /// via the other early exit). bool hasEarlyExit() const { - return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1; + return ExitBlocks.size() > 1 || + (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } /// Returns true if the scalar tail may execute after the vector loop. 
diff --git a/llvm/test/Transforms/LoopVectorize/vect.stats.ll b/llvm/test/Transforms/LoopVectorize/vect.stats.ll index 9a55dc99c316b..018e2c213ddf2 100644 --- a/llvm/test/Transforms/LoopVectorize/vect.stats.ll +++ b/llvm/test/Transforms/LoopVectorize/vect.stats.ll @@ -1,12 +1,12 @@ -; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize --disable-output -stats -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization --disable-output -stats -S 2>&1 | FileCheck %s ; REQUIRES: asserts -; -; We have 2 loops, one of them is vectorizable and the second one is not. -; +; We have 3 loops, two of them are vectorizable (with one being early-exit +; vectorized) and the third one is not. -; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization -; CHECK: 1 loop-vectorize - Number of loops vectorized +; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of early exit loops vectorized +; CHECK: 2 loop-vectorize - Number of loops vectorized target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -31,6 +31,36 @@ for.end: ; preds = %entry, %for.body ret void } +define i32 @early_exit_vectorized(i64 %end) { +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i64 %end, 1023 + br label %for.body + +for.body: + %ind = phi i64 [ %ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, 
label %found, label %for.inc + +for.inc: + %ind.next = add i64 %ind, 1 + %cmp = icmp ult i64 %ind.next, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + define void @not_vectorized(ptr nocapture %a, i64 %size) { entry: %cmp1 = icmp sle i64 %size, 0 @@ -56,3 +86,5 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %entry, %for.body ret void } + +declare void @init_mem(ptr, i64);