diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index dd6b88fee415a..67beec09949f4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1917,6 +1917,72 @@ isLoopVariantIndirectAddress(ArrayRef UnderlyingObjects,
   });
 }
 
+/// Returns true if \p Expr may vary across iterations of \p L, i.e. if
+/// ScalarEvolution cannot prove it invariant in \p L.
+///
+/// Querying ScalarEvolution directly, instead of only walking nested AddRecs,
+/// also catches expressions that vary in \p L without being an AddRec at the
+/// top level, which is the conservative answer the caller needs.
+static bool isAffectedByLoop(const SCEV *Expr, const Loop *L,
+                             ScalarEvolution &SE) {
+  return !SE.isLoopInvariant(Expr, L);
+}
+
+// Consider the following case:
+//
+// for (int j = 0; j < 256; j++)       // Loop j
+//   for (int i = j + 1; i < 256; i++) // Loop i
+//     a[i] -= aa[j][i] * a[j];
+//
+// The SCEV of &a[j] is {@a,+,4} in Loop j, which does not vary in Loop i, so
+// a[j] stays a scalar access when Loop i is vectorized. If the byte size of
+// that scalar access is <= the smallest value of Dist(a[j], a[i]), the two
+// accesses never overlap and Loop i can be vectorized.
+//
+// Here a[j] is a 4-byte float access and Dist(a[j], a[i]) is {4,+,4} in
+// Loop i, whose minimum is its start value, 4.
+//
+// Returns true if Dist is provably at least the access size of Src (the
+// access made by AInst) on every iteration of InnermostLoop. Returns false
+// whenever that cannot be shown, including for access types whose byte size
+// cannot be derived from the type alone (vectors, pointers, aggregates).
+static bool isSrcNoOverlap(const SCEV *Src, Instruction *AInst,
+                           const SCEV *Dist, const Loop *InnermostLoop,
+                           ScalarEvolution &SE) {
+  // Src must not vary in InnermostLoop; only then is it kept as a scalar
+  // access instead of being widened when InnermostLoop is vectorized.
+  if (isAffectedByLoop(Src, InnermostLoop, SE))
+    return false;
+
+  // Dist must have the form {Start,+,Step} in InnermostLoop.
+  const auto *Diff = dyn_cast<SCEVAddRecExpr>(Dist);
+  if (!Diff || Diff->getLoop() != InnermostLoop)
+    return false;
+
+  // With a constant Start and a constant non-negative Step, Start is the
+  // minimum distance between the two accesses.
+  const auto *MinDiff = dyn_cast<SCEVConstant>(Diff->getStart());
+  const auto *DiffInc = dyn_cast<SCEVConstant>(Diff->getStepRecurrence(SE));
+  if (!MinDiff || !DiffInc || DiffInc->getAPInt().isNegative())
+    return false;
+
+  // getScalarSizeInBits() yields the element size for vectors and 0 for
+  // pointers and aggregates, so it only describes the whole access for
+  // scalar integer/floating-point types. Give up on everything else.
+  Type *AccessTy = getLoadStoreType(AInst);
+  const uint64_t AccessBits = AccessTy->getScalarSizeInBits();
+  if (AccessTy->isVectorTy() || AccessBits == 0)
+    return false;
+
+  // Round up so that types which are not a whole number of bytes wide are
+  // not under-counted.
+  const uint64_t AccessBytes = (AccessBits + 7) / 8;
+
+  // Src is a scalar access of AccessBytes bytes; if even the smallest
+  // distance is at least that large, the accesses never overlap.
+  return MinDiff->getAPInt().sge(static_cast<int64_t>(AccessBytes));
+}
+
 // Get the dependence distance, stride, type size in whether i is a write for
 // the dependence between A and B. Returns a DepType, if we can prove there's
 // no dependence or the analysis fails. Outlined to lambda to limit he scope
@@ -1979,6 +2045,9 @@ getDependenceDistanceStrideAndSize(
                                    InnermostLoop))
     return MemoryDepChecker::Dependence::IndirectUnsafe;
 
+  if (isSrcNoOverlap(Src, AInst, Dist, InnermostLoop, SE))
+    return MemoryDepChecker::Dependence::NoDep;
+
   // Need accesses with constant stride. We don't want to vectorize
   // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
   // in the address space.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
index d5239d5a4e33d..3b9d53ca600b6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s -check-prefix=LOOP-ACCESS
-; RUN: opt -passes=loop-vectorize -debug-only=vectorutils -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses -force-vector-width=4 -max-dependences=97 -disable-output %s 2>&1 | FileCheck %s -check-prefix=LOOP-ACCESS
+; RUN: opt -passes=loop-vectorize -debug-only=vectorutils -force-vector-width=4 -max-dependences=97 -disable-output %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-redhat-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
new file mode 100644
index 0000000000000..e17236a8f23a1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -S | FileCheck %s
+
+@aa = global [256 x [256 x float]] zeroinitializer, align 4
+@a = global [32000 x float] zeroinitializer, align 4
+
+;; The SCEV of &a[j] is {@a,+,4} in Loop j, so a[j] is treated as a scalar
+;; when vectorizing Loop i. If the access size of a[j] <= Dist(a[j], a[i]),
+;; the two accesses do not overlap and Loop i can be vectorized.
+;;
+;; In this case the access size of a[j] is 4 bytes (float) and
+;; Dist(a[j], a[i]) is {4,+,4}, whose minimum distance is 4.
+;;
+;;  for (int j = 0; j < 256; j++)        // Loop j
+;;    for (int i = j+1; i < 256; i++)    // Loop i
+;;      a[i] -= aa[j][i] * a[j];
+
+; CHECK: vector.body:
+
+define signext i32 @s115() {
+entry:
+  br label %for.body
+
+for.cond.loopexit.loopexit:                       ; preds = %for.body4
+  br label %for.cond.loopexit
+
+for.cond.loopexit:                                ; preds = %for.cond.loopexit.loopexit, %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; next start value for i
+  %exitcond28.not = icmp eq i64 %indvars.iv.next27, 256 ; outer loop ends once j + 1 == 256
+  br i1 %exitcond28.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.loopexit
+  ret i32 0
+
+for.body:                                         ; preds = %entry, %for.cond.loopexit
+  %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ] ; j
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.cond.loopexit ] ; j + 1, the first value of i
+  %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
+  %cmp221 = icmp ult i64 %indvars.iv26, 255 ; skip the inner loop when j == 255 (no i in range)
+  br i1 %cmp221, label %for.body4.lr.ph, label %for.cond.loopexit
+
+for.body4.lr.ph:                                  ; preds = %for.body
+  %arrayidx8 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv26 ; &a[j], computed once before the inner loop
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body4.lr.ph, %for.body4
+  %indvars.iv24 = phi i64 [ %indvars.iv, %for.body4.lr.ph ], [ %indvars.iv.next25, %for.body4 ] ; i
+  %arrayidx6 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv26, i64 %indvars.iv24 ; &aa[j][i]
+  %0 = load float, ptr %arrayidx6, align 4
+  %1 = load float, ptr %arrayidx8, align 4 ; a[j]: same address on every inner-loop iteration
+  %arrayidx10 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv24 ; &a[i]
+  %2 = load float, ptr %arrayidx10, align 4
+  %neg = fneg float %0
+  %3 = tail call float @llvm.fmuladd.f32(float %neg, float %1, float %2) ; NOTE(review): no explicit declare for this intrinsic in the file - confirm the IR parser accepts that
+  store float %3, ptr %arrayidx10, align 4 ; a[i] = a[i] - aa[j][i] * a[j]
+  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next25, 256
+  br i1 %exitcond.not, label %for.cond.loopexit.loopexit, label %for.body4
+}