Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
3c6f7fe
[LV] Add initial legality checks for ee loops with stores
huntergr-arm Jun 24, 2025
255fdb6
* Remove load tracking from LVL class, make it a local passed as needed
huntergr-arm Jul 18, 2025
b991d44
Name new parameter in function prototypes
huntergr-arm Jul 31, 2025
7cae713
Improve remarks, remove contentious FIXME
huntergr-arm Jul 31, 2025
70769de
Add FIXME for comparison operator ordering assumption
huntergr-arm Jul 31, 2025
553cc93
Added test with a gather load for the uncounted exit condition
huntergr-arm Jul 31, 2025
a6189e2
Added test with a switch for the uncounted exit
huntergr-arm Jul 31, 2025
3bb93d2
Added remark for non-branch terminator on uncounted exit
huntergr-arm Jul 31, 2025
231d17a
Initialize LAI earlier if we have an EE loop with side effects
huntergr-arm Aug 5, 2025
4e5d4c2
Add maxdeps=1 test
huntergr-arm Aug 5, 2025
1a9360d
Separate out ee-with-side-effect checking and try to explain it better
huntergr-arm Aug 6, 2025
23770b0
Improve comments, simplify condition checking
huntergr-arm Aug 12, 2025
9c5436a
Use AA directly instead of through dependencies
huntergr-arm Aug 20, 2025
022f3e6
Flatten exec guarantee check a bit
huntergr-arm Aug 21, 2025
21a5682
Flatten condition IR detection
huntergr-arm Aug 26, 2025
e80821e
Assert for execute guarantee check for now
huntergr-arm Aug 27, 2025
2233dcf
Obtain AA via LoopVectorize
huntergr-arm Sep 1, 2025
83e10d9
formatting fixup after rebase
huntergr-arm Sep 9, 2025
d5aa5ef
* Improved comments
huntergr-arm Sep 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 97 additions & 11 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,18 @@ struct HistogramInfo {
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
TargetTransformInfo *TTI, TargetLibraryInfo *TLI, Function *F,
LoopAccessInfoManager &LAIs, LoopInfo *LI, OptimizationRemarkEmitter *ORE,
LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
DominatorTree *DT, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, Function *F,
LoopAccessInfoManager &LAIs, LoopInfo *LI,
OptimizationRemarkEmitter *ORE,
LoopVectorizationRequirements *R,
LoopVectorizeHints *H, DemandedBits *DB,
AssumptionCache *AC, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, AAResults *AA)
: TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT), LAIs(LAIs),
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI),
PSI(PSI) {}
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI), PSI(PSI),
AA(AA) {}

/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
Expand Down Expand Up @@ -407,6 +410,14 @@ class LoopVectorizationLegality {
return UncountableExitingBB;
}

/// Reports whether this loop was recognised as an early exit loop that also
/// contains state-changing operations (the early-exit legality analysis sets
/// the underlying flag when it finds simple stores in such a loop). When this
/// is true, the condition for the uncountable exit has to be determined
/// before any of the state changes or potentially-faulting operations take
/// place.
bool hasUncountableExitWithSideEffects() const {
  return UncountableExitWithSideEffects;
}

/// Return true if there are store-load forwarding dependencies.
bool isSafeForAnyStoreLoadForwardDistances() const {
return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
Expand Down Expand Up @@ -524,20 +535,87 @@ class LoopVectorizationLegality {
/// Returns true if this is an early exit loop that can be vectorized.
/// Currently, a loop with an uncountable early exit is considered
/// vectorizable if:
/// 1. There are no writes to memory in the loop.
/// 1. Writes to memory will access different underlying objects than
/// any load used as part of the uncountable exit condition.
/// 2. The loop has only one early uncountable exit
/// 3. The early exit block dominates the latch block.
/// 4. The latch block has an exact exit count.
/// 5. The loop does not contain reductions or recurrences.
/// 6. We can prove at compile-time that loops will not contain faulting
/// loads.
/// loads, or that any faulting loads would also occur in a purely
/// scalar loop.
/// 7. It is safe to speculatively execute instructions such as divide or
/// call instructions.
/// call instructions.
/// The list above is not based on theoretical limitations of vectorization,
/// but simply a statement that more work is needed to support these
/// additional cases safely.
bool isVectorizableEarlyExitLoop();

/// When vectorizing an early exit loop containing side effects, we need to
/// determine whether an uncountable exit will be taken before any operation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// determine whether an uncounted exit will be taken before any operation
/// determine whether an uncountable exit will be taken before any operation

/// that has side effects.
///
/// Consider a loop like the following:
/// for (int i = 0; i < N; ++i) {
/// a[i] = b[i];
/// if (c[i] == 0)
/// break;
/// }
///
/// We have both a load and a store operation occurring before the condition
/// is checked for early termination. We could potentially restrict
/// vectorization to cases where we know all addresses are guaranteed to be
/// dereferenceable, which would allow the load before the condition check to
/// be vectorized.
///
/// The store, however, should not execute across all lanes if early
/// termination occurs before the end of the vector. We must only store to the
/// locations that would have been stored to by a scalar loop. So we need to
/// know what the result of 'c[i] == 0' is before performing the vector store,
/// with or without masking.
///
/// We can either do this by moving the condition load to the top of the
/// vector body and using the comparison to create masks for other operations
/// in the loop, or by looking ahead one vector iteration and bailing out to
/// the scalar loop if an exit would occur.
///
/// Using the latter approach (applicable to more targets), we need to hoist
/// the first load (of c[0]) out of the loop then rotate the load within the
/// loop to the next iteration, remembering to adjust the vector trip count.
/// Something like the following:
///
/// vec.ph:
/// %ci.0 = load <4 x i32>, ptr %c
/// %cmp.0 = icmp eq <4 x i32> %ci.0, zeroinitializer
/// %any.of.0 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.0)
/// br i1 %any.of.0, label %scalar.ph, label %vec.body
/// vec.body:
/// %iv = phi...
/// phi for c[i] if used elsewhere in the loop...
/// other operations in the loop...
/// %iv.next = add i64 %iv, 4
/// %addr.next = getelementptr i32, ptr %c, i64 %iv.next
/// %ci.next = load <4 x i32>, ptr %addr.next
/// %cmp.next = icmp eq <4 x i32> %ci.next, zeroinitializer
/// %any.of.next = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.next)
/// iv.next compared with shortened vector tripcount...
/// uncountable condition combined with counted condition...
/// br...
///
/// Doing this means the last few iterations will always be performed by a
/// scalar loop regardless of which exit is taken, and so vector iterations
/// will never execute a memory operation to a location that the scalar loop
/// would not have.
///
/// This means we must ensure that it is safe to move the load for 'c[i]'
/// before other memory operations (or any other observable side effects) in
/// the loop.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment looks really helpful, thanks!

Is it also worth adding that c[i] is not permitted to have more than one use for now? An outside use complicates things, since you would then have to use a phi for c[i] instead of c[i] itself.

///
/// Currently, c[i] must have only one user (the comparison used for the
/// uncountable exit) since we would otherwise need to introduce a PHI node
/// for it.
bool canUncountableExitConditionLoadBeMoved(BasicBlock *ExitingBlock);

/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
Expand Down Expand Up @@ -646,6 +724,10 @@ class LoopVectorizationLegality {
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;

/// Alias analysis results used to check for possible aliasing with loads
/// used in uncountable exit conditions.
AAResults *AA;

/// If we discover function calls within the loop which have a valid
/// vectorized variant, record that fact so that LoopVectorize can
/// (potentially) make a better decision on the maximum VF and enable
Expand All @@ -659,6 +741,10 @@ class LoopVectorizationLegality {
/// Keep track of an uncountable exiting block, if there is exactly one early
/// exit.
BasicBlock *UncountableExitingBB = nullptr;

/// If true, the loop has at least one uncountable exit and operations within
/// the loop may have observable side effects.
bool UncountableExitWithSideEffects = false;
};

} // namespace llvm
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
LoopAccessInfoManager *LAIs;
OptimizationRemarkEmitter *ORE;
ProfileSummaryInfo *PSI;
AAResults *AA;

LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
LLVM_ABI void
Expand Down
154 changes: 142 additions & 12 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
Expand Down Expand Up @@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
});
}

if (!LAI->canVectorizeMemory())
if (!LAI->canVectorizeMemory()) {
if (hasUncountableExitWithSideEffects()) {
reportVectorizationFailure(
"Cannot vectorize unsafe dependencies in uncountable exit loop with "
"side effects",
"CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
TheLoop);
return false;
}

return canVectorizeIndirectUnsafeDependences();
}

if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
Expand Down Expand Up @@ -1755,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
};

bool HasSideEffects = false;
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
if (I.mayWriteToMemory()) {
// We don't support writes to memory.
if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) {
HasSideEffects = true;
continue;
}

// We don't support complex writes to memory.
reportVectorizationFailure(
"Writes to memory unsupported in early exit loops",
"Cannot vectorize early exit loop with writes to memory",
"Complex writes to memory unsupported in early exit loops",
"Cannot vectorize early exit loop with complex writes to memory",
"WritesInEarlyExitLoop", ORE, TheLoop);
return false;
} else if (!IsSafeOperation(&I)) {
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this code change?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that was just an unrelated NFC change to comply with the coding standards (no else/else if after a return).


if (!IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
Expand All @@ -1777,15 +1797,22 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
"Expected latch predecessor to be the early exiting block");

Predicates.clear();
SmallVector<LoadInst *, 4> NonDerefLoads;
if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
&Predicates)) {
reportVectorizationFailure("Loop may fault",
"Cannot vectorize non-read-only early exit loop",
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
// TODO: Handle loops that may fault.
if (!HasSideEffects) {
// Read-only loop.
Predicates.clear();
if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
&Predicates)) {
reportVectorizationFailure(
"Loop may fault", "Cannot vectorize non-read-only early exit loop",
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
return false;
}
} else if (!canUncountableExitConditionLoadBeMoved(
SingleUncountableExitingBlock))
return false;
}

// Check non-dereferenceable loads if any.
for (LoadInst *LI : NonDerefLoads) {
// Only support unit-stride access for now.
Expand Down Expand Up @@ -1813,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
"backedge taken count: "
<< *SymbolicMaxBTC << '\n');
UncountableExitingBB = SingleUncountableExitingBlock;
UncountableExitWithSideEffects = HasSideEffects;
return true;
}

// Checks whether the load that feeds the uncountable exit condition of
// \p ExitingBlock is simple enough, and independent enough of the loop's
// memory writes, to be moved ahead of them.
//
// Returns true only if all of the following hold:
//  * the exiting branch condition is a single-use icmp whose first operand is
//    a single-use load and whose second operand is invariant in TheLoop;
//  * the load address is an add recurrence of TheLoop itself;
//  * the load is dereferenceable and aligned for the whole loop;
//  * no instruction in the loop that may write to memory can alias the load
//    address (only stores proven NoAlias by alias analysis are accepted).
// On failure a vectorization remark is emitted and false is returned.
bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
    BasicBlock *ExitingBlock) {
  // Try to find a load in the critical path for the uncountable exit condition.
  // This is currently matching about the simplest form we can, expecting
  // only one in-loop load, the result of which is directly compared against
  // a loop-invariant value.
  // FIXME: We're insisting on a single use for now, because otherwise we will
  // need to make PHI nodes for other users. That can be done once the initial
  // transform code lands.
  auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());

  using namespace llvm::PatternMatch;
  Instruction *L = nullptr;
  Value *Ptr = nullptr;
  Value *R = nullptr;
  if (!match(Br->getCondition(),
             m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
                             m_Value(R))))) {
    reportVectorizationFailure(
        "Early exit loop with store but no supported condition load",
        "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  // FIXME: Don't rely on operand ordering for the comparison.
  if (!TheLoop->isLoopInvariant(R)) {
    reportVectorizationFailure(
        "Early exit loop with store but no supported condition load",
        "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  // Make sure that the load address is not loop invariant; we want an
  // address calculation that we can rotate to the next vector iteration.
  // It is not enough for the address to be *some* add recurrence: an add
  // recurrence of a parent loop is invariant in TheLoop, so also require that
  // the recurrence belongs to TheLoop.
  const auto *PtrAddRec =
      dyn_cast<SCEVAddRecExpr>(PSE.getSE()->getSCEV(Ptr));
  if (!PtrAddRec || PtrAddRec->getLoop() != TheLoop) {
    reportVectorizationFailure(
        "Uncountable exit condition depends on load with an address that is "
        "not an add recurrence",
        "EarlyExitLoadInvariantAddress", ORE, TheLoop);
    return false;
  }

  // FIXME: Support gathers after first-faulting load support lands.
  // NOTE(review): the predicates collected here live in a local vector and
  // are not recorded anywhere else in this function -- confirm whether they
  // need to be registered with PSE for the dereferenceability result to hold.
  SmallVector<const SCEVPredicate *, 4> Predicates;
  LoadInst *Load = cast<LoadInst>(L);
  if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC,
                                         &Predicates)) {
    reportVectorizationFailure(
        "Loop may fault",
        "Cannot vectorize potentially faulting early exit loop",
        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  ICFLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(TheLoop);
  // We need to know that the load will be executed before we can hoist a
  // copy out to run just before the first iteration.
  // FIXME: Currently, other restrictions prevent us from reaching this point
  //        with a loop where the uncountable exit condition is determined
  //        by a conditional load.
  assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) &&
         "Unhandled control flow in uncountable exit loop with side effects");

  // Prohibit any potential aliasing with any instruction in the loop which
  // might store to memory.
  // FIXME: Relax this constraint where possible.
  for (auto *BB : TheLoop->blocks()) {
    for (auto &I : *BB) {
      if (&I == Load)
        continue;

      if (I.mayWriteToMemory()) {
        // Only plain stores that alias analysis proves disjoint from the
        // condition load's address are tolerated; every other writer
        // (including stores with any weaker alias result) rejects the loop.
        if (auto *SI = dyn_cast<StoreInst>(&I)) {
          AliasResult AR = AA->alias(Ptr, SI->getPointerOperand());
          if (AR == AliasResult::NoAlias)
            continue;
        }

        reportVectorizationFailure(
            "Cannot determine whether critical uncountable exit load address "
            "does not alias with a memory write",
            "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
        return false;
      }
    }
  }

  return true;
}

Expand Down Expand Up @@ -1885,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
} else {
if (!isVectorizableEarlyExitLoop()) {
assert(!hasUncountableEarlyExit() &&
!hasUncountableExitWithSideEffects() &&
"Must be false without vectorizable early-exit loop");
if (DoExtraAnalysis)
Result = false;
Expand All @@ -1903,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
}

// Bail out for state-changing loops with uncountable exits for now.
if (UncountableExitWithSideEffects) {
reportVectorizationFailure(
"Writes to memory unsupported in early exit loops",
"Cannot vectorize early exit loop with writes to memory",
"WritesInEarlyExitLoop", ORE, TheLoop);
return false;
}

if (Result) {
LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9811,7 +9811,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
&Requirements, &Hints, DB, AC, BFI, PSI, AA);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Hints.emitRemarkWithHints();
Expand Down Expand Up @@ -10248,6 +10248,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
DB = &AM.getResult<DemandedBitsAnalysis>(F);
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
LAIs = &AM.getResult<LoopAccessAnalysis>(F);
AA = &AM.getResult<AAManager>(F);

auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
Expand Down
Loading