Skip to content

Commit af9a426

Browse files
authored
[LAA] Only use inbounds/nusw in isNoWrap if the GEP is dereferenced. (llvm#161445)
Update isNoWrap to only use the inbounds/nusw flags from GEPs that are guaranteed to be dereferenced on every iteration. This fixes a case where we incorrectly determine no dependence. I think the issue is isolated to code that evaluates the resulting AddRec at BTC, just using it to compute the distance between accesses should still be fine; if the access does not execute in a given iteration, there's no dependence in that iteration. But isolating the code is not straight-forward, so be conservative for now. The practical impact should be very minor (only one loop changed across a corpus with 27k modules from large C/C++ workloads. Fixes llvm#160912. PR: llvm#161445
1 parent 83d1599 commit af9a426

File tree

10 files changed

+188
-57
lines changed

10 files changed

+188
-57
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
893893
/// result of this function is undefined.
894894
LLVM_ABI std::optional<int64_t>
895895
getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
896-
const Loop *Lp,
896+
const Loop *Lp, const DominatorTree &DT,
897897
const DenseMap<Value *, const SCEV *> &StridesMap =
898898
DenseMap<Value *, const SCEV *>(),
899899
bool Assume = false, bool ShouldCheckWrap = true);

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -806,11 +806,11 @@ class AccessAnalysis {
806806
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
807807

808808
AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
809-
MemoryDepChecker::DepCandidates &DA,
809+
DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
810810
PredicatedScalarEvolution &PSE,
811811
SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
812-
: TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
813-
LoopAliasScopes(LoopAliasScopes) {
812+
: TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
813+
PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
814814
// We're analyzing dependences across loop iterations.
815815
BAA.enableCrossIterationMode();
816816
}
@@ -934,6 +934,9 @@ class AccessAnalysis {
934934
/// The LoopInfo of the loop being checked.
935935
const LoopInfo *LI;
936936

937+
/// The dominator tree of the function.
938+
DominatorTree &DT;
939+
937940
/// Sets of potentially dependent accesses - members of one set share an
938941
/// underlying pointer. The set "CheckDeps" identfies which sets really need a
939942
/// dependence check.
@@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
10151018
/// informating from the IR pointer value to determine no-wrap.
10161019
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
10171020
Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
1021+
const DominatorTree &DT,
10181022
std::optional<int64_t> Stride = std::nullopt) {
10191023
// FIXME: This should probably only return true for NUW.
10201024
if (AR->getNoWrapFlags(SCEV::NoWrapMask))
@@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
10291033
// case, the GEP would be poison and any memory access dependent on it would
10301034
// be immediate UB when executed.
10311035
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr);
1032-
GEP && GEP->hasNoUnsignedSignedWrap())
1033-
return true;
1036+
GEP && GEP->hasNoUnsignedSignedWrap()) {
1037+
// For the above reasoning to apply, the pointer must be dereferenced in
1038+
// every iteration.
1039+
if (L->getHeader() == L->getLoopLatch() ||
1040+
any_of(GEP->users(), [L, &DT, GEP](User *U) {
1041+
if (getLoadStorePointerOperand(U) != GEP)
1042+
return false;
1043+
BasicBlock *UserBB = cast<Instruction>(U)->getParent();
1044+
return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT);
1045+
}))
1046+
return true;
1047+
}
10341048

10351049
if (!Stride)
10361050
Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE);
@@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess(
12931307
}
12941308

12951309
if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
1296-
TheLoop, Assume))
1310+
TheLoop, Assume, DT))
12971311
return false;
12981312
}
12991313

@@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() {
16061620
/// Check whether the access through \p Ptr has a constant stride.
16071621
std::optional<int64_t>
16081622
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
1609-
const Loop *Lp,
1623+
const Loop *Lp, const DominatorTree &DT,
16101624
const DenseMap<Value *, const SCEV *> &StridesMap,
16111625
bool Assume, bool ShouldCheckWrap) {
16121626
const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
@@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
16301644
if (!ShouldCheckWrap || !Stride)
16311645
return Stride;
16321646

1633-
if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride))
1647+
if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride))
16341648
return Stride;
16351649

16361650
LLVM_DEBUG(
@@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
20472061
BPtr->getType()->getPointerAddressSpace())
20482062
return MemoryDepChecker::Dependence::Unknown;
20492063

2050-
std::optional<int64_t> StrideAPtr =
2051-
getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
2052-
std::optional<int64_t> StrideBPtr =
2053-
getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
2064+
std::optional<int64_t> StrideAPtr = getPtrStride(
2065+
PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true);
2066+
std::optional<int64_t> StrideBPtr = getPtrStride(
2067+
PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true);
20542068

20552069
const SCEV *Src = PSE.getSCEV(APtr);
20562070
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
26272641
}
26282642

26292643
MemoryDepChecker::DepCandidates DepCands;
2630-
AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes);
2644+
AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
2645+
LoopAliasScopes);
26312646

26322647
// Holds the analyzed pointers. We don't want to call getUnderlyingObjects
26332648
// multiple times on the same object. If the ptr is accessed twice, once
@@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
26912706
bool IsReadOnlyPtr = false;
26922707
Type *AccessTy = getLoadStoreType(LD);
26932708
if (Seen.insert({Ptr, AccessTy}).second ||
2694-
!getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) {
2709+
!getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false,
2710+
true)) {
26952711
++NumReads;
26962712
IsReadOnlyPtr = true;
26972713
}

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
13871387
// wrap around the address space we would do a memory access at nullptr
13881388
// even without the transformation. The wrapping checks are therefore
13891389
// deferred until after we've formed the interleaved groups.
1390-
int64_t Stride =
1391-
getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
1392-
/*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);
1390+
int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides,
1391+
/*Assume=*/true, /*ShouldCheckWrap=*/false)
1392+
.value_or(0);
13931393

13941394
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
13951395
AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
@@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
16431643
assert(Member && "Group member does not exist");
16441644
Value *MemberPtr = getLoadStorePointerOperand(Member);
16451645
Type *AccessTy = getLoadStoreType(Member);
1646-
if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
1647-
/*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0))
1646+
if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides,
1647+
/*Assume=*/false, /*ShouldCheckWrap=*/true)
1648+
.value_or(0))
16481649
return false;
16491650
LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
16501651
<< FirstOrLast

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6207,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
62076207
}
62086208

62096209
static bool containsDecreasingPointers(Loop *TheLoop,
6210-
PredicatedScalarEvolution *PSE) {
6210+
PredicatedScalarEvolution *PSE,
6211+
const DominatorTree &DT) {
62116212
const auto &Strides = DenseMap<Value *, const SCEV *>();
62126213
for (BasicBlock *BB : TheLoop->blocks()) {
62136214
// Scan the instructions in the block and look for addresses that are
@@ -6216,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop,
62166217
if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
62176218
Value *Ptr = getLoadStorePointerOperand(&I);
62186219
Type *AccessTy = getLoadStoreType(&I);
6219-
if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
6220-
/*ShouldCheckWrap=*/false)
6220+
if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6221+
/*Assume=*/true, /*ShouldCheckWrap=*/false)
62216222
.value_or(0) < 0)
62226223
return true;
62236224
}
@@ -6262,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
62626263
// negative strides. This will require extra work to reverse the loop
62636264
// predicate, which may be expensive.
62646265
if (containsDecreasingPointers(TFI->LVL->getLoop(),
6265-
TFI->LVL->getPredicatedScalarEvolution()))
6266+
TFI->LVL->getPredicatedScalarEvolution(),
6267+
*TFI->LVL->getDominatorTree()))
62666268
Required |= TailFoldingOpts::Reverse;
62676269
if (Required == TailFoldingOpts::Disabled)
62686270
Required |= TailFoldingOpts::Simple;

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
24482448
//
24492449
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
24502450
const DataLayout &DL,
2451-
const LoopAccessInfo *LAI) {
2451+
const LoopAccessInfo *LAI,
2452+
const DominatorTree &DT) {
24522453
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
24532454

24542455
// If there are live-out values, it is probably a reduction. We can predicate
@@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
24982499
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
24992500
Value *Ptr = getLoadStorePointerOperand(&I);
25002501
Type *AccessTy = getLoadStoreType(&I);
2501-
int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2502+
int64_t NextStride =
2503+
getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
25022504
if (NextStride == 1) {
25032505
// TODO: for now only allow consecutive strides of 1. We could support
25042506
// other strides as long as it is uniform, but let's keep it simple
@@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
25852587
return false;
25862588
}
25872589

2588-
return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2590+
return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2591+
*LVL->getDominatorTree());
25892592
}
25902593

25912594
TailFoldingStyle

llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate {
8989
/// Return true if the dependence from the store to the load has an
9090
/// absolute distance of one.
9191
/// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
92-
bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
93-
Loop *L) const {
92+
bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L,
93+
const DominatorTree &DT) const {
9494
Value *LoadPtr = Load->getPointerOperand();
9595
Value *StorePtr = Store->getPointerOperand();
9696
Type *LoadType = getLoadStoreType(Load);
@@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate {
102102
DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
103103
"Should be a known dependence");
104104

105-
int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
106-
int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
105+
int64_t StrideLoad =
106+
getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0);
107+
int64_t StrideStore =
108+
getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0);
107109
if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
108110
return false;
109111

@@ -287,8 +289,8 @@ class LoadEliminationForLoop {
287289
// so deciding which one forwards is easy. The later one forwards as
288290
// long as they both have a dependence distance of one to the load.
289291
if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
290-
Cand.isDependenceDistanceOfOne(PSE, L) &&
291-
OtherCand->isDependenceDistanceOfOne(PSE, L)) {
292+
Cand.isDependenceDistanceOfOne(PSE, L, *DT) &&
293+
OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) {
292294
// They are in the same block, the later one will forward to the load.
293295
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
294296
OtherCand = &Cand;
@@ -538,7 +540,7 @@ class LoadEliminationForLoop {
538540

539541
// Check whether the SCEV difference is the same as the induction step,
540542
// thus we load the value in the next iteration.
541-
if (!Cand.isDependenceDistanceOfOne(PSE, L))
543+
if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT))
542544
continue;
543545

544546
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
462462

463463
bool CanAddPredicate = !llvm::shouldOptimizeForSize(
464464
TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465-
int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
466-
CanAddPredicate, false).value_or(0);
465+
int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides,
466+
CanAddPredicate, false)
467+
.value_or(0);
467468
if (Stride == 1 || Stride == -1)
468469
return Stride;
469470
return 0;

llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
; s0 += (1ULL << 62) + 1;
1111
; s1 += (1ULL << 62) + 2;
1212
; }
13-
; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine
13+
; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine
1414
; there are no dependences), as the pointers are not dereferenced in all loop iterations.
1515
define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) {
1616
; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block'
@@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) {
1919
; CHECK-NEXT: Dependences:
2020
; CHECK-NEXT: Run-time memory checks:
2121
; CHECK-NEXT: Grouped accesses:
22+
; CHECK-NEXT: Group GRP0:
23+
; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
24+
; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
25+
; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
2226
; CHECK-EMPTY:
2327
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
2428
; CHECK-NEXT: SCEV assumptions:
29+
; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
2530
; CHECK-EMPTY:
2631
; CHECK-NEXT: Expressions re-written:
2732
;
@@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr
6368
; CHECK-NEXT: Dependences:
6469
; CHECK-NEXT: Run-time memory checks:
6570
; CHECK-NEXT: Grouped accesses:
71+
; CHECK-NEXT: Group GRP0:
72+
; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
73+
; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
74+
; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
6675
; CHECK-EMPTY:
6776
; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop.
6877
; CHECK-NEXT: SCEV assumptions:
78+
; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
6979
; CHECK-EMPTY:
7080
; CHECK-NEXT: Expressions re-written:
7181
;
@@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i
109119
; CHECK-NEXT: Dependences:
110120
; CHECK-NEXT: Run-time memory checks:
111121
; CHECK-NEXT: Grouped accesses:
122+
; CHECK-NEXT: Group GRP0:
123+
; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
124+
; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
125+
; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
112126
; CHECK-EMPTY:
113127
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
114128
; CHECK-NEXT: SCEV assumptions:
129+
; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
115130
; CHECK-EMPTY:
116131
; CHECK-NEXT: Expressions re-written:
117132
;

0 commit comments

Comments
 (0)