-
Notifications
You must be signed in to change notification settings - Fork 15k
[LV] Add initial legality checks for early exit loops with side effects #145663
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3c6f7fe
255fdb6
b991d44
7cae713
70769de
553cc93
a6189e2
3bb93d2
231d17a
4e5d4c2
1a9360d
23770b0
9c5436a
022f3e6
21a5682
e80821e
2233dcf
83e10d9
d5aa5ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -251,15 +251,18 @@ struct HistogramInfo { | |
| /// induction variable and the different reduction variables. | ||
| class LoopVectorizationLegality { | ||
| public: | ||
| LoopVectorizationLegality( | ||
| Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, | ||
| TargetTransformInfo *TTI, TargetLibraryInfo *TLI, Function *F, | ||
| LoopAccessInfoManager &LAIs, LoopInfo *LI, OptimizationRemarkEmitter *ORE, | ||
| LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB, | ||
| AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) | ||
| LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, | ||
| DominatorTree *DT, TargetTransformInfo *TTI, | ||
| TargetLibraryInfo *TLI, Function *F, | ||
| LoopAccessInfoManager &LAIs, LoopInfo *LI, | ||
| OptimizationRemarkEmitter *ORE, | ||
| LoopVectorizationRequirements *R, | ||
| LoopVectorizeHints *H, DemandedBits *DB, | ||
| AssumptionCache *AC, BlockFrequencyInfo *BFI, | ||
| ProfileSummaryInfo *PSI, AAResults *AA) | ||
| : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT), LAIs(LAIs), | ||
| ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI), | ||
| PSI(PSI) {} | ||
| ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI), PSI(PSI), | ||
| AA(AA) {} | ||
|
|
||
| /// ReductionList contains the reduction descriptors for all | ||
| /// of the reductions that were found in the loop. | ||
|
|
@@ -407,6 +410,14 @@ class LoopVectorizationLegality { | |
| return UncountableExitingBB; | ||
| } | ||
|
|
||
| /// Returns true if this is an early exit loop with state-changing or | ||
| /// potentially-faulting operations and the condition for the uncountable | ||
| /// exit must be determined before any of the state changes or potentially | ||
| /// faulting operations take place. | ||
| /// | ||
| /// The underlying flag defaults to false and is recorded by | ||
| /// isVectorizableEarlyExitLoop() once the loop has passed its legality | ||
| /// checks; it is set when the loop contains at least one simple store. | ||
| bool hasUncountableExitWithSideEffects() const { | ||
| return UncountableExitWithSideEffects; | ||
| } | ||
|
|
||
| /// Return true if there are store-load forwarding dependencies. | ||
| bool isSafeForAnyStoreLoadForwardDistances() const { | ||
| return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances(); | ||
|
|
@@ -524,20 +535,87 @@ class LoopVectorizationLegality { | |
| /// Returns true if this is an early exit loop that can be vectorized. | ||
| /// Currently, a loop with an uncountable early exit is considered | ||
| /// vectorizable if: | ||
| /// 1. There are no writes to memory in the loop. | ||
| /// 1. Writes to memory will access different underlying objects than | ||
| /// any load used as part of the uncountable exit condition. | ||
| /// 2. The loop has only one early uncountable exit | ||
| /// 3. The early exit block dominates the latch block. | ||
| /// 4. The latch block has an exact exit count. | ||
| /// 5. The loop does not contain reductions or recurrences. | ||
| /// 6. We can prove at compile-time that loops will not contain faulting | ||
| /// loads. | ||
| /// loads, or that any faulting loads would also occur in a purely | ||
| /// scalar loop. | ||
| /// 7. It is safe to speculatively execute instructions such as divide or | ||
| /// call instructions. | ||
| /// call instructions. | ||
| /// The list above is not based on theoretical limitations of vectorization, | ||
| /// but simply a statement that more work is needed to support these | ||
| /// additional cases safely. | ||
| bool isVectorizableEarlyExitLoop(); | ||
|
|
||
| /// When vectorizing an early exit loop containing side effects, we need to | ||
| /// determine whether an uncountable exit will be taken before any operation | ||
| /// that has side effects. | ||
| /// | ||
| /// Consider a loop like the following: | ||
| /// for (int i = 0; i < N; ++i) { | ||
| /// a[i] = b[i]; | ||
| /// if (c[i] == 0) | ||
| /// break; | ||
| /// } | ||
| /// | ||
| /// We have both a load and a store operation occurring before the condition | ||
| /// is checked for early termination. We could potentially restrict | ||
| /// vectorization to cases where we know all addresses are guaranteed to be | ||
| /// dereferenceable, which would allow the load before the condition check to | ||
| /// be vectorized. | ||
| /// | ||
| /// The store, however, should not execute across all lanes if early | ||
| /// termination occurs before the end of the vector. We must only store to the | ||
| /// locations that would have been stored to by a scalar loop. So we need to | ||
| /// know what the result of 'c[i] == 0' is before performing the vector store, | ||
| /// with or without masking. | ||
| /// | ||
| /// We can either do this by moving the condition load to the top of the | ||
| /// vector body and using the comparison to create masks for other operations | ||
| /// in the loop, or by looking ahead one vector iteration and bailing out to | ||
| /// the scalar loop if an exit would occur. | ||
| /// | ||
| /// Using the latter approach (applicable to more targets), we need to hoist | ||
| /// the first load (of c[0]) out of the loop then rotate the load within the | ||
| /// loop to the next iteration, remembering to adjust the vector trip count. | ||
| /// Something like the following: | ||
| /// | ||
| /// vec.ph: | ||
| /// %ci.0 = load <4 x i32>, ptr %c | ||
| /// %cmp.0 = icmp eq <4 x i32> %ci.0, zeroinitializer | ||
| /// %any.of.0 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.0) | ||
| /// br i1 %any.of.0, label %scalar.ph, label %vec.body | ||
| /// vec.body: | ||
| /// %iv = phi... | ||
| /// phi for c[i] if used elsewhere in the loop... | ||
| /// other operations in the loop... | ||
| /// %iv.next = add i64 %iv, 4 | ||
| /// %addr.next = getelementptr i32, ptr %c, i64 %iv.next | ||
| /// %ci.next = load <4 x i32>, ptr %addr.next | ||
| /// %cmp.next = icmp eq <4 x i32> %ci.next, zeroinitializer | ||
| /// %any.of.next = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.next) | ||
| /// iv.next compared with shortened vector tripcount... | ||
| /// uncountable condition combined with counted condition... | ||
| /// br... | ||
| /// | ||
| /// Doing this means the last few iterations will always be performed by a | ||
| /// scalar loop regardless of which exit is taken, and so vector iterations | ||
| /// will never execute a memory operation to a location that the scalar loop | ||
| /// would not have. | ||
| /// | ||
| /// This means we must ensure that it is safe to move the load for 'c[i]' | ||
| /// before other memory operations (or any other observable side effects) in | ||
| /// the loop. | ||
|
||
| /// | ||
| /// Currently, c[i] must have only one user (the comparison used for the | ||
| /// uncountable exit) since we would otherwise need to introduce a PHI node | ||
| /// for it. | ||
| bool canUncountableExitConditionLoadBeMoved(BasicBlock *ExitingBlock); | ||
|
|
||
| /// Return true if all of the instructions in the block can be speculatively | ||
| /// executed, and record the loads/stores that require masking. | ||
| /// \p SafePtrs is a list of addresses that are known to be legal and we know | ||
|
|
@@ -646,6 +724,10 @@ class LoopVectorizationLegality { | |
| BlockFrequencyInfo *BFI; | ||
| ProfileSummaryInfo *PSI; | ||
|
|
||
| // Alias Analysis results used to check for possible aliasing with loads | ||
| // used in uncountable exit conditions. | ||
| AAResults *AA; | ||
|
|
||
| /// If we discover function calls within the loop which have a valid | ||
| /// vectorized variant, record that fact so that LoopVectorize can | ||
| /// (potentially) make a better decision on the maximum VF and enable | ||
|
|
@@ -659,6 +741,10 @@ class LoopVectorizationLegality { | |
| /// Keep track of an uncountable exiting block, if there is exactly one early | ||
| /// exit. | ||
| BasicBlock *UncountableExitingBB = nullptr; | ||
|
|
||
| /// If true, the loop has at least one uncountable exit and operations within | ||
| /// the loop may have observable side effects. | ||
| bool UncountableExitWithSideEffects = false; | ||
| }; | ||
|
|
||
| } // namespace llvm | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,8 +15,10 @@ | |
| // | ||
|
|
||
| #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" | ||
| #include "llvm/Analysis/AliasAnalysis.h" | ||
| #include "llvm/Analysis/Loads.h" | ||
| #include "llvm/Analysis/LoopInfo.h" | ||
| #include "llvm/Analysis/MustExecute.h" | ||
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" | ||
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" | ||
| #include "llvm/Analysis/TargetLibraryInfo.h" | ||
|
|
@@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() { | |
| }); | ||
| } | ||
|
|
||
| if (!LAI->canVectorizeMemory()) | ||
| if (!LAI->canVectorizeMemory()) { | ||
| if (hasUncountableExitWithSideEffects()) { | ||
| reportVectorizationFailure( | ||
| "Cannot vectorize unsafe dependencies in uncountable exit loop with " | ||
| "side effects", | ||
| "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, | ||
| TheLoop); | ||
| return false; | ||
| } | ||
|
|
||
| return canVectorizeIndirectUnsafeDependences(); | ||
| } | ||
|
|
||
| if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) { | ||
| reportVectorizationFailure("We don't allow storing to uniform addresses", | ||
|
|
@@ -1755,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { | |
| } | ||
| }; | ||
|
|
||
| bool HasSideEffects = false; | ||
| for (auto *BB : TheLoop->blocks()) | ||
| for (auto &I : *BB) { | ||
| if (I.mayWriteToMemory()) { | ||
| // We don't support writes to memory. | ||
| if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) { | ||
| HasSideEffects = true; | ||
| continue; | ||
| } | ||
|
|
||
| // We don't support complex writes to memory. | ||
| reportVectorizationFailure( | ||
| "Writes to memory unsupported in early exit loops", | ||
| "Cannot vectorize early exit loop with writes to memory", | ||
| "Complex writes to memory unsupported in early exit loops", | ||
| "Cannot vectorize early exit loop with complex writes to memory", | ||
| "WritesInEarlyExitLoop", ORE, TheLoop); | ||
| return false; | ||
| } else if (!IsSafeOperation(&I)) { | ||
| } | ||
|
||
|
|
||
| if (!IsSafeOperation(&I)) { | ||
| reportVectorizationFailure("Early exit loop contains operations that " | ||
| "cannot be speculatively executed", | ||
| "UnsafeOperationsEarlyExitLoop", ORE, | ||
|
|
@@ -1777,15 +1797,22 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { | |
| assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock && | ||
| "Expected latch predecessor to be the early exiting block"); | ||
|
|
||
| Predicates.clear(); | ||
| SmallVector<LoadInst *, 4> NonDerefLoads; | ||
| if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads, | ||
| &Predicates)) { | ||
| reportVectorizationFailure("Loop may fault", | ||
| "Cannot vectorize non-read-only early exit loop", | ||
| "NonReadOnlyEarlyExitLoop", ORE, TheLoop); | ||
| // TODO: Handle loops that may fault. | ||
| if (!HasSideEffects) { | ||
| // Read-only loop. | ||
| Predicates.clear(); | ||
| if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads, | ||
| &Predicates)) { | ||
| reportVectorizationFailure( | ||
| "Loop may fault", "Cannot vectorize non-read-only early exit loop", | ||
| "NonReadOnlyEarlyExitLoop", ORE, TheLoop); | ||
| return false; | ||
| } | ||
| } else if (!canUncountableExitConditionLoadBeMoved( | ||
| SingleUncountableExitingBlock)) | ||
| return false; | ||
| } | ||
|
|
||
| // Check non-dereferenceable loads if any. | ||
| for (LoadInst *LI : NonDerefLoads) { | ||
| // Only support unit-stride access for now. | ||
|
|
@@ -1813,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { | |
| "backedge taken count: " | ||
| << *SymbolicMaxBTC << '\n'); | ||
| UncountableExitingBB = SingleUncountableExitingBlock; | ||
| UncountableExitWithSideEffects = HasSideEffects; | ||
| return true; | ||
| } | ||
|
|
||
| bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved( | ||
| BasicBlock *ExitingBlock) { | ||
| // Try to find a load in the critical path for the uncountable exit condition. | ||
| // This is currently matching about the simplest form we can, expecting | ||
| // only one in-loop load, the result of which is directly compared against | ||
| // a loop-invariant value. | ||
| // FIXME: We're insisting on a single use for now, because otherwise we will | ||
| // need to make PHI nodes for other users. That can be done once the initial | ||
| // transform code lands. | ||
| auto *Br = cast<BranchInst>(ExitingBlock->getTerminator()); | ||
|
|
||
| using namespace llvm::PatternMatch; | ||
| Instruction *L = nullptr; | ||
| Value *Ptr = nullptr; | ||
| Value *R = nullptr; | ||
| if (!match(Br->getCondition(), | ||
| m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))), | ||
| m_Value(R))))) { | ||
| reportVectorizationFailure( | ||
| "Early exit loop with store but no supported condition load", | ||
| "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); | ||
| return false; | ||
| } | ||
|
|
||
| // FIXME: Don't rely on operand ordering for the comparison. | ||
| if (!TheLoop->isLoopInvariant(R)) { | ||
| reportVectorizationFailure( | ||
| "Early exit loop with store but no supported condition load", | ||
| "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); | ||
| return false; | ||
| } | ||
|
|
||
| // Make sure that the load address is not loop invariant; we want an | ||
| // address calculation that we can rotate to the next vector iteration. | ||
| const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr); | ||
| if (!isa<SCEVAddRecExpr>(PtrScev)) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is missing a check that the AddRec is in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @huntergr-arm I see you pushed a fix, would be helpful for visibility to mention it somewhere here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (same when other comments have been addressed, there appear to be no email notifications when reacting with emojis, so there won't be any notifications when the comments have been addressed and it is ready for another look) |
||
| reportVectorizationFailure( | ||
| "Uncountable exit condition depends on load with an address that is " | ||
| "not an add recurrence", | ||
| "EarlyExitLoadInvariantAddress", ORE, TheLoop); | ||
| return false; | ||
| } | ||
|
|
||
| // FIXME: Support gathers after first-faulting load support lands. | ||
| SmallVector<const SCEVPredicate *, 4> Predicates; | ||
| LoadInst *Load = cast<LoadInst>(L); | ||
| if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC, | ||
| &Predicates)) { | ||
| reportVectorizationFailure( | ||
| "Loop may fault", | ||
| "Cannot vectorize potentially faulting early exit loop", | ||
| "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); | ||
| return false; | ||
| } | ||
|
|
||
| ICFLoopSafetyInfo SafetyInfo; | ||
| SafetyInfo.computeLoopSafetyInfo(TheLoop); | ||
| // We need to know that load will be executed before we can hoist a | ||
| // copy out to run just before the first iteration. | ||
| // FIXME: Currently, other restrictions prevent us from reaching this point | ||
| // with a loop where the uncountable exit condition is determined | ||
| // by a conditional load. | ||
| assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) && | ||
| "Unhandled control flow in uncountable exit loop with side effects"); | ||
|
|
||
| // Prohibit any potential aliasing with any instruction in the loop which | ||
| // might store to memory. | ||
| // FIXME: Relax this constraint where possible. | ||
| for (auto *BB : TheLoop->blocks()) { | ||
| for (auto &I : *BB) { | ||
| if (&I == Load) | ||
| continue; | ||
|
|
||
| if (I.mayWriteToMemory()) { | ||
| if (auto *SI = dyn_cast<StoreInst>(&I)) { | ||
| AliasResult AR = AA->alias(Ptr, SI->getPointerOperand()); | ||
| if (AR == AliasResult::NoAlias) | ||
| continue; | ||
| } | ||
|
|
||
| reportVectorizationFailure( | ||
| "Cannot determine whether critical uncountable exit load address " | ||
| "does not alias with a memory write", | ||
| "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop); | ||
| return false; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -1885,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { | |
| } else { | ||
| if (!isVectorizableEarlyExitLoop()) { | ||
| assert(!hasUncountableEarlyExit() && | ||
| !hasUncountableExitWithSideEffects() && | ||
| "Must be false without vectorizable early-exit loop"); | ||
| if (DoExtraAnalysis) | ||
| Result = false; | ||
|
|
@@ -1903,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { | |
| return false; | ||
| } | ||
|
|
||
| // Bail out for state-changing loops with uncountable exits for now. | ||
| if (UncountableExitWithSideEffects) { | ||
| reportVectorizationFailure( | ||
| "Writes to memory unsupported in early exit loops", | ||
| "Cannot vectorize early exit loop with writes to memory", | ||
| "WritesInEarlyExitLoop", ORE, TheLoop); | ||
| return false; | ||
| } | ||
|
|
||
| if (Result) { | ||
| LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" | ||
| << (LAI->getRuntimePointerChecking()->Need | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.