Skip to content

Commit 3c810b7

Browse files
authored
[LV] Add initial legality checks for early exit loops with side effects (#145663)
This adds initial support to LoopVectorizationLegality to analyze loops with side effects (particularly stores to memory) and an uncountable exit. This patch alone doesn't enable any new transformations, but does give clearer reasons for rejecting vectorization for such a loop. The intent is for a loop like the following to pass the specific checks, and only be rejected at the end until the transformation code is committed:

```
// Assume a is marked restrict
// Assume b is known to be large enough to access up to b[N-1]
for (int i = 0; i < N; ++i) {
  a[i]++;
  if (b[i] > threshold)
    break;
}
```
1 parent 5b81367 commit 3c810b7

File tree

6 files changed

+417
-37
lines changed

6 files changed

+417
-37
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 97 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -251,15 +251,18 @@ struct HistogramInfo {
251251
/// induction variable and the different reduction variables.
252252
class LoopVectorizationLegality {
253253
public:
254-
LoopVectorizationLegality(
255-
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
256-
TargetTransformInfo *TTI, TargetLibraryInfo *TLI, Function *F,
257-
LoopAccessInfoManager &LAIs, LoopInfo *LI, OptimizationRemarkEmitter *ORE,
258-
LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
259-
AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
254+
LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
255+
DominatorTree *DT, TargetTransformInfo *TTI,
256+
TargetLibraryInfo *TLI, Function *F,
257+
LoopAccessInfoManager &LAIs, LoopInfo *LI,
258+
OptimizationRemarkEmitter *ORE,
259+
LoopVectorizationRequirements *R,
260+
LoopVectorizeHints *H, DemandedBits *DB,
261+
AssumptionCache *AC, BlockFrequencyInfo *BFI,
262+
ProfileSummaryInfo *PSI, AAResults *AA)
260263
: TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT), LAIs(LAIs),
261-
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI),
262-
PSI(PSI) {}
264+
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI), PSI(PSI),
265+
AA(AA) {}
263266

264267
/// ReductionList contains the reduction descriptors for all
265268
/// of the reductions that were found in the loop.
@@ -407,6 +410,14 @@ class LoopVectorizationLegality {
407410
return UncountableExitingBB;
408411
}
409412

413+
/// Returns true if this is an early exit loop with state-changing or
414+
/// potentially-faulting operations and the condition for the uncountable
415+
/// exit must be determined before any of the state changes or potentially
416+
/// faulting operations take place.
417+
bool hasUncountableExitWithSideEffects() const {
418+
return UncountableExitWithSideEffects;
419+
}
420+
410421
/// Return true if there is store-load forwarding dependencies.
411422
bool isSafeForAnyStoreLoadForwardDistances() const {
412423
return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
@@ -524,20 +535,87 @@ class LoopVectorizationLegality {
524535
/// Returns true if this is an early exit loop that can be vectorized.
525536
/// Currently, a loop with an uncountable early exit is considered
526537
/// vectorizable if:
527-
/// 1. There are no writes to memory in the loop.
538+
/// 1. Writes to memory will access different underlying objects than
539+
/// any load used as part of the uncountable exit condition.
528540
/// 2. The loop has only one early uncountable exit
529541
/// 3. The early exit block dominates the latch block.
530542
/// 4. The latch block has an exact exit count.
531543
/// 5. The loop does not contain reductions or recurrences.
532544
/// 6. We can prove at compile-time that loops will not contain faulting
533-
/// loads.
545+
/// loads, or that any faulting loads would also occur in a purely
546+
/// scalar loop.
534547
/// 7. It is safe to speculatively execute instructions such as divide or
535-
/// call instructions.
548+
/// call instructions.
536549
/// The list above is not based on theoretical limitations of vectorization,
537550
/// but simply a statement that more work is needed to support these
538551
/// additional cases safely.
539552
bool isVectorizableEarlyExitLoop();
540553

554+
/// When vectorizing an early exit loop containing side effects, we need to
555+
/// determine whether an uncountable exit will be taken before any operation
556+
/// that has side effects.
557+
///
558+
/// Consider a loop like the following:
559+
/// for (int i = 0; i < N; ++i) {
560+
/// a[i] = b[i];
561+
/// if (c[i] == 0)
562+
/// break;
563+
/// }
564+
///
565+
/// We have both a load and a store operation occurring before the condition
566+
/// is checked for early termination. We could potentially restrict
567+
/// vectorization to cases where we know all addresses are guaranteed to be
568+
/// dereferenceable, which would allow the load before the condition check to
569+
/// be vectorized.
570+
///
571+
/// The store, however, should not execute across all lanes if early
572+
/// termination occurs before the end of the vector. We must only store to the
573+
/// locations that would have been stored to by a scalar loop. So we need to
574+
/// know what the result of 'c[i] == 0' is before performing the vector store,
575+
/// with or without masking.
576+
///
577+
/// We can either do this by moving the condition load to the top of the
578+
/// vector body and using the comparison to create masks for other operations
579+
/// in the loop, or by looking ahead one vector iteration and bailing out to
580+
/// the scalar loop if an exit would occur.
581+
///
582+
/// Using the latter approach (applicable to more targets), we need to hoist
583+
/// the first load (of c[0]) out of the loop then rotate the load within the
584+
/// loop to the next iteration, remembering to adjust the vector trip count.
585+
/// Something like the following:
586+
///
587+
/// vec.ph:
588+
/// %ci.0 = load <4 x i32>, ptr %c
589+
/// %cmp.0 = icmp eq <4 x i32> %ci.0, zeroinitializer
590+
/// %any.of.0 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.0)
591+
/// br i1 %any.of.0, label %scalar.ph, label %vec.body
592+
/// vec.body:
593+
/// %iv = phi...
594+
/// phi for c[i] if used elsewhere in the loop...
595+
/// other operations in the loop...
596+
/// %iv.next = add i64 %iv, 4
597+
/// %addr.next = getelementptr i32, ptr %c, i64 %iv.next
598+
/// %ci.next = load <4 x i32>, ptr %addr.next
599+
/// %cmp.next = icmp eq <4 x i32> %ci.next, zeroinitializer
600+
/// %any.of.next = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.next)
601+
/// iv.next compared with shortened vector tripcount...
602+
/// uncountable condition combined with counted condition...
603+
/// br...
604+
///
605+
/// Doing this means the last few iterations will always be performed by a
606+
/// scalar loop regardless of which exit is taken, and so vector iterations
607+
/// will never execute a memory operation to a location that the scalar loop
608+
/// would not have.
609+
///
610+
/// This means we must ensure that it is safe to move the load for 'c[i]'
611+
/// before other memory operations (or any other observable side effects) in
612+
/// the loop.
613+
///
614+
/// Currently, c[i] must have only one user (the comparison used for the
615+
/// uncountable exit) since we would otherwise need to introduce a PHI node
616+
/// for it.
617+
bool canUncountableExitConditionLoadBeMoved(BasicBlock *ExitingBlock);
618+
541619
/// Return true if all of the instructions in the block can be speculatively
542620
/// executed, and record the loads/stores that require masking.
543621
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -646,6 +724,10 @@ class LoopVectorizationLegality {
646724
BlockFrequencyInfo *BFI;
647725
ProfileSummaryInfo *PSI;
648726

727+
// Alias Analysis results used to check for possible aliasing with loads
728+
// used in uncountable exit conditions.
729+
AAResults *AA;
730+
649731
/// If we discover function calls within the loop which have a valid
650732
/// vectorized variant, record that fact so that LoopVectorize can
651733
/// (potentially) make a better decision on the maximum VF and enable
@@ -659,6 +741,10 @@ class LoopVectorizationLegality {
659741
/// Keep track of an uncountable exiting block, if there is exactly one early
660742
/// exit.
661743
BasicBlock *UncountableExitingBB = nullptr;
744+
745+
/// If true, the loop has at least one uncountable exit and operations within
746+
/// the loop may have observable side effects.
747+
bool UncountableExitWithSideEffects = false;
662748
};
663749

664750
} // namespace llvm

llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
152152
LoopAccessInfoManager *LAIs;
153153
OptimizationRemarkEmitter *ORE;
154154
ProfileSummaryInfo *PSI;
155+
AAResults *AA;
155156

156157
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
157158
LLVM_ABI void

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 142 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
//
1616

1717
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18+
#include "llvm/Analysis/AliasAnalysis.h"
1819
#include "llvm/Analysis/Loads.h"
1920
#include "llvm/Analysis/LoopInfo.h"
21+
#include "llvm/Analysis/MustExecute.h"
2022
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
2123
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
2224
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
12231225
});
12241226
}
12251227

1226-
if (!LAI->canVectorizeMemory())
1228+
if (!LAI->canVectorizeMemory()) {
1229+
if (hasUncountableExitWithSideEffects()) {
1230+
reportVectorizationFailure(
1231+
"Cannot vectorize unsafe dependencies in uncountable exit loop with "
1232+
"side effects",
1233+
"CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
1234+
TheLoop);
1235+
return false;
1236+
}
1237+
12271238
return canVectorizeIndirectUnsafeDependences();
1239+
}
12281240

12291241
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
12301242
reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1755,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17551767
}
17561768
};
17571769

1770+
bool HasSideEffects = false;
17581771
for (auto *BB : TheLoop->blocks())
17591772
for (auto &I : *BB) {
17601773
if (I.mayWriteToMemory()) {
1761-
// We don't support writes to memory.
1774+
if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) {
1775+
HasSideEffects = true;
1776+
continue;
1777+
}
1778+
1779+
// We don't support complex writes to memory.
17621780
reportVectorizationFailure(
1763-
"Writes to memory unsupported in early exit loops",
1764-
"Cannot vectorize early exit loop with writes to memory",
1781+
"Complex writes to memory unsupported in early exit loops",
1782+
"Cannot vectorize early exit loop with complex writes to memory",
17651783
"WritesInEarlyExitLoop", ORE, TheLoop);
17661784
return false;
1767-
} else if (!IsSafeOperation(&I)) {
1785+
}
1786+
1787+
if (!IsSafeOperation(&I)) {
17681788
reportVectorizationFailure("Early exit loop contains operations that "
17691789
"cannot be speculatively executed",
17701790
"UnsafeOperationsEarlyExitLoop", ORE,
@@ -1777,15 +1797,22 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17771797
assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
17781798
"Expected latch predecessor to be the early exiting block");
17791799

1780-
Predicates.clear();
17811800
SmallVector<LoadInst *, 4> NonDerefLoads;
1782-
if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
1783-
&Predicates)) {
1784-
reportVectorizationFailure("Loop may fault",
1785-
"Cannot vectorize non-read-only early exit loop",
1786-
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1801+
// TODO: Handle loops that may fault.
1802+
if (!HasSideEffects) {
1803+
// Read-only loop.
1804+
Predicates.clear();
1805+
if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
1806+
&Predicates)) {
1807+
reportVectorizationFailure(
1808+
"Loop may fault", "Cannot vectorize non-read-only early exit loop",
1809+
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1810+
return false;
1811+
}
1812+
} else if (!canUncountableExitConditionLoadBeMoved(
1813+
SingleUncountableExitingBlock))
17871814
return false;
1788-
}
1815+
17891816
// Check non-dereferenceable loads if any.
17901817
for (LoadInst *LI : NonDerefLoads) {
17911818
// Only support unit-stride access for now.
@@ -1813,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
18131840
"backedge taken count: "
18141841
<< *SymbolicMaxBTC << '\n');
18151842
UncountableExitingBB = SingleUncountableExitingBlock;
1843+
UncountableExitWithSideEffects = HasSideEffects;
1844+
return true;
1845+
}
1846+
1847+
bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
1848+
BasicBlock *ExitingBlock) {
1849+
// Try to find a load in the critical path for the uncountable exit condition.
1850+
// This is currently matching about the simplest form we can, expecting
1851+
// only one in-loop load, the result of which is directly compared against
1852+
// a loop-invariant value.
1853+
// FIXME: We're insisting on a single use for now, because otherwise we will
1854+
// need to make PHI nodes for other users. That can be done once the initial
1855+
// transform code lands.
1856+
auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());
1857+
1858+
using namespace llvm::PatternMatch;
1859+
Instruction *L = nullptr;
1860+
Value *Ptr = nullptr;
1861+
Value *R = nullptr;
1862+
if (!match(Br->getCondition(),
1863+
m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
1864+
m_Value(R))))) {
1865+
reportVectorizationFailure(
1866+
"Early exit loop with store but no supported condition load",
1867+
"NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1868+
return false;
1869+
}
1870+
1871+
// FIXME: Don't rely on operand ordering for the comparison.
1872+
if (!TheLoop->isLoopInvariant(R)) {
1873+
reportVectorizationFailure(
1874+
"Early exit loop with store but no supported condition load",
1875+
"NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1876+
return false;
1877+
}
1878+
1879+
// Make sure that the load address is not loop invariant; we want an
1880+
// address calculation that we can rotate to the next vector iteration.
1881+
const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
1882+
if (!isa<SCEVAddRecExpr>(PtrScev)) {
1883+
reportVectorizationFailure(
1884+
"Uncountable exit condition depends on load with an address that is "
1885+
"not an add recurrence",
1886+
"EarlyExitLoadInvariantAddress", ORE, TheLoop);
1887+
return false;
1888+
}
1889+
1890+
// FIXME: Support gathers after first-faulting load support lands.
1891+
SmallVector<const SCEVPredicate *, 4> Predicates;
1892+
LoadInst *Load = cast<LoadInst>(L);
1893+
if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC,
1894+
&Predicates)) {
1895+
reportVectorizationFailure(
1896+
"Loop may fault",
1897+
"Cannot vectorize potentially faulting early exit loop",
1898+
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1899+
return false;
1900+
}
1901+
1902+
ICFLoopSafetyInfo SafetyInfo;
1903+
SafetyInfo.computeLoopSafetyInfo(TheLoop);
1904+
// We need to know that the load will be executed before we can hoist a
1905+
// copy out to run just before the first iteration.
1906+
// FIXME: Currently, other restrictions prevent us from reaching this point
1907+
// with a loop where the uncountable exit condition is determined
1908+
// by a conditional load.
1909+
assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) &&
1910+
"Unhandled control flow in uncountable exit loop with side effects");
1911+
1912+
// Prohibit any potential aliasing with any instruction in the loop which
1913+
// might store to memory.
1914+
// FIXME: Relax this constraint where possible.
1915+
for (auto *BB : TheLoop->blocks()) {
1916+
for (auto &I : *BB) {
1917+
if (&I == Load)
1918+
continue;
1919+
1920+
if (I.mayWriteToMemory()) {
1921+
if (auto *SI = dyn_cast<StoreInst>(&I)) {
1922+
AliasResult AR = AA->alias(Ptr, SI->getPointerOperand());
1923+
if (AR == AliasResult::NoAlias)
1924+
continue;
1925+
}
1926+
1927+
reportVectorizationFailure(
1928+
"Cannot determine whether critical uncountable exit load address "
1929+
"does not alias with a memory write",
1930+
"CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
1931+
return false;
1932+
}
1933+
}
1934+
}
1935+
18161936
return true;
18171937
}
18181938

@@ -1885,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
18852005
} else {
18862006
if (!isVectorizableEarlyExitLoop()) {
18872007
assert(!hasUncountableEarlyExit() &&
2008+
!hasUncountableExitWithSideEffects() &&
18882009
"Must be false without vectorizable early-exit loop");
18892010
if (DoExtraAnalysis)
18902011
Result = false;
@@ -1903,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
19032024
return false;
19042025
}
19052026

2027+
// Bail out for state-changing loops with uncountable exits for now.
2028+
if (UncountableExitWithSideEffects) {
2029+
reportVectorizationFailure(
2030+
"Writes to memory unsupported in early exit loops",
2031+
"Cannot vectorize early exit loop with writes to memory",
2032+
"WritesInEarlyExitLoop", ORE, TheLoop);
2033+
return false;
2034+
}
2035+
19062036
if (Result) {
19072037
LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
19082038
<< (LAI->getRuntimePointerChecking()->Need

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9810,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
98109810
// Check if it is legal to vectorize the loop.
98119811
LoopVectorizationRequirements Requirements;
98129812
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9813-
&Requirements, &Hints, DB, AC, BFI, PSI);
9813+
&Requirements, &Hints, DB, AC, BFI, PSI, AA);
98149814
if (!LVL.canVectorize(EnableVPlanNativePath)) {
98159815
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
98169816
Hints.emitRemarkWithHints();
@@ -10247,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
1024710247
DB = &AM.getResult<DemandedBitsAnalysis>(F);
1024810248
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1024910249
LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10250+
AA = &AM.getResult<AAManager>(F);
1025010251

1025110252
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1025210253
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());

0 commit comments

Comments
 (0)