Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1917,6 +1917,74 @@ isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
});
}

static bool isAffectedByLoop(const SCEV *Expr, const Loop *L,
ScalarEvolution &SE) {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
if (!AddRec)
return false;

if (AddRec->getLoop() == L)
return true;

const SCEV *Start = AddRec->getStart();
const SCEV *Step = AddRec->getStepRecurrence(SE);
return isAffectedByLoop(Start, L, SE) || isAffectedByLoop(Step, L, SE);
}

// Consider the following case:
//
// for (int j = 0; j < 256; j++) // Loop j
// for (int i = j+1; i < 256; i++)// Loop i
// a[i] -= aa[j][i] * a[j];
//
// Given that SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] will be treated as scalar
// when vectorizing Loop i. If the accessing size of a[j] <= Dist(a[j], a[i]),
// there is no overlapped and can be vectorized.
//
// In this case, accessing size of a[j] is 4 byte(float) and Dist(a[j], a[i])
// is {4,+,4} which bring the minimum distance as 4.
//
// Return true if Dist is equal or greater than the accessing size of Src.
static bool isSrcNoOverlap(const SCEV *Src, Instruction *AInst,
const SCEV *Dist, const Loop *InnermostLoop,
ScalarEvolution &SE) {
// If the Src is not affected by InnermostLoop, when vectorizing
// InnermostLoop, Src will be treated as scalar instead of widening to vector.
if (isAffectedByLoop(Src, InnermostLoop, SE))
return false;

if (!isa<SCEVAddRecExpr>(Dist))
return false;

auto *Diff = cast<SCEVAddRecExpr>(Dist);

if (Diff->getLoop() != InnermostLoop)
return false;

if (!isa<SCEVConstant>(Diff->getStart()))
return false;

if (!isa<SCEVConstant>(Diff->getStepRecurrence(SE)))
return false;

const SCEVConstant *DiffInc = cast<SCEVConstant>(Diff->getStepRecurrence(SE));
if (DiffInc->getAPInt().isNegative())
return false;

// If the step of Diff is positve and the Start of diff is constant,
// we can get the minimum diff between Src and Dst.
const SCEVConstant *MinDiff = cast<SCEVConstant>(Diff->getStart());

// If we get here, Src won't be vectorized, so we only need to consider the
// scalar load/store size. If the minimum diff between Src and Dst is equal
// or greater than the load/store size, there is no overlapped.
if (MinDiff->getAPInt().getSExtValue() >=
getLoadStoreType(AInst)->getScalarSizeInBits() / 8)
return true;

return false;
}

// Get the dependence distance, stride, type size and whether it is a write for
// the dependence between A and B. Returns a DepType, if we can prove there's
// no dependence or the analysis fails. Outlined to lambda to limit the scope
Expand Down Expand Up @@ -1979,6 +2047,9 @@ getDependenceDistanceStrideAndSize(
InnermostLoop))
return MemoryDepChecker::Dependence::IndirectUnsafe;

if (isSrcNoOverlap(Src, AInst, Dist, InnermostLoop, SE))
return MemoryDepChecker::Dependence::NoDep;

// Need accesses with constant stride. We don't want to vectorize
// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
// in the address space.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s -check-prefix=LOOP-ACCESS
; RUN: opt -passes=loop-vectorize -debug-only=vectorutils -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses -force-vector-width=4 -max-dependences=97 -disable-output %s 2>&1 | FileCheck %s -check-prefix=LOOP-ACCESS
; RUN: opt -passes=loop-vectorize -debug-only=vectorutils -force-vector-width=4 -max-dependences=97 -disable-output %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"

Expand Down
58 changes: 58 additions & 0 deletions llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -S | FileCheck %s

@aa = global [256 x [256 x float]] zeroinitializer, align 4
@a = global [32000 x float] zeroinitializer, align 4

;; Given that the SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] is treated as a scalar
;; when vectorizing Loop i. If the access size of a[j] <= Dist(a[j], a[i]),
;; the accesses do not overlap and Loop i can be vectorized.
;;
;; In this case, the access size of a[j] is 4 bytes (float) and Dist(a[j], a[i])
;; is {4,+,4}, which gives a minimum distance of 4.
;;
;; for (int j = 0; j < 256; j++) // Loop j
;; for (int i = j+1; i < 256; i++)// Loop i
;; a[i] -= aa[j][i] * a[j];

; CHECK: vector.body:

; Two-level loop nest: the outer loop (for.body, "Loop j") walks j from 0 and
; the inner loop (for.body4, "Loop i") walks i from j+1, updating a[i] with
; a[i] - aa[j][i] * a[j]. Always returns 0.
; NOTE(review): @llvm.fmuladd.f32 is called below but not declared in this
; file; this relies on the IR parser accepting undeclared intrinsics - confirm.
define signext i32 @s115() {
entry:
br label %for.body

for.cond.loopexit.loopexit: ; preds = %for.body4
br label %for.cond.loopexit

; Outer-loop latch: advance the inner loop's start index and leave the outer
; loop once the incremented j reaches 256.
for.cond.loopexit: ; preds = %for.cond.loopexit.loopexit, %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond28.not = icmp eq i64 %indvars.iv.next27, 256
br i1 %exitcond28.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.cond.loopexit
ret i32 0

; Outer-loop header: %indvars.iv26 is j (starts at 0) and %indvars.iv is the
; inner loop's initial i (starts at 1, incremented alongside j). The inner loop
; is entered only while j < 255.
for.body: ; preds = %entry, %for.cond.loopexit
%indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ]
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.cond.loopexit ]
%indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
%cmp221 = icmp ult i64 %indvars.iv26, 255
br i1 %cmp221, label %for.body4.lr.ph, label %for.cond.loopexit

; Inner-loop preheader: &a[j] is computed once here, outside the inner loop.
for.body4.lr.ph: ; preds = %for.body
%arrayidx8 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv26
br label %for.body4

; Inner-loop body: %indvars.iv24 is i. Loads aa[j][i], a[j] and a[i], computes
; fmuladd(-aa[j][i], a[j], a[i]) and stores the result back to a[i]. The loop
; exits when the incremented i equals 256.
for.body4: ; preds = %for.body4.lr.ph, %for.body4
%indvars.iv24 = phi i64 [ %indvars.iv, %for.body4.lr.ph ], [ %indvars.iv.next25, %for.body4 ]
%arrayidx6 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv26, i64 %indvars.iv24
%0 = load float, ptr %arrayidx6, align 4
%1 = load float, ptr %arrayidx8, align 4
%arrayidx10 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv24
%2 = load float, ptr %arrayidx10, align 4
%neg = fneg float %0
%3 = tail call float @llvm.fmuladd.f32(float %neg, float %1, float %2)
store float %3, ptr %arrayidx10, align 4
%indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
%exitcond.not = icmp eq i64 %indvars.iv.next25, 256
br i1 %exitcond.not, label %for.cond.loopexit.loopexit, label %for.body4
}