Skip to content

Commit 2954354

Browse files
author
git apple-llvm automerger
committed
Merge commit '3c810b76b974' from llvm.org/main into next
2 parents ee3feec + 3c810b7 commit 2954354

File tree

6 files changed

+417
-37
lines changed

6 files changed

+417
-37
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 97 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -251,15 +251,18 @@ struct HistogramInfo {
251251
/// induction variable and the different reduction variables.
252252
class LoopVectorizationLegality {
253253
public:
254-
LoopVectorizationLegality(
255-
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
256-
TargetTransformInfo *TTI, TargetLibraryInfo *TLI, Function *F,
257-
LoopAccessInfoManager &LAIs, LoopInfo *LI, OptimizationRemarkEmitter *ORE,
258-
LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
259-
AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
254+
LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
255+
DominatorTree *DT, TargetTransformInfo *TTI,
256+
TargetLibraryInfo *TLI, Function *F,
257+
LoopAccessInfoManager &LAIs, LoopInfo *LI,
258+
OptimizationRemarkEmitter *ORE,
259+
LoopVectorizationRequirements *R,
260+
LoopVectorizeHints *H, DemandedBits *DB,
261+
AssumptionCache *AC, BlockFrequencyInfo *BFI,
262+
ProfileSummaryInfo *PSI, AAResults *AA)
260263
: TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT), LAIs(LAIs),
261-
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI),
262-
PSI(PSI) {}
264+
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC), BFI(BFI), PSI(PSI),
265+
AA(AA) {}
263266

264267
/// ReductionList contains the reduction descriptors for all
265268
/// of the reductions that were found in the loop.
@@ -407,6 +410,14 @@ class LoopVectorizationLegality {
407410
return UncountableExitingBB;
408411
}
409412

413+
/// Returns true if this is an early exit loop with state-changing or
414+
/// potentially-faulting operations and the condition for the uncountable
415+
/// exit must be determined before any of the state changes or potentially
416+
/// faulting operations take place.
417+
bool hasUncountableExitWithSideEffects() const {
418+
return UncountableExitWithSideEffects;
419+
}
420+
410421
/// Return true if the dependence checker reports that the loop is safe for
/// any store-load forwarding distance.
411422
bool isSafeForAnyStoreLoadForwardDistances() const {
412423
return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
@@ -524,20 +535,87 @@ class LoopVectorizationLegality {
524535
/// Returns true if this is an early exit loop that can be vectorized.
525536
/// Currently, a loop with an uncountable early exit is considered
526537
/// vectorizable if:
527-
/// 1. There are no writes to memory in the loop.
538+
/// 1. Writes to memory will access different underlying objects than
539+
/// any load used as part of the uncountable exit condition.
528540
/// 2. The loop has only one uncountable early exit.
529541
/// 3. The early exit block dominates the latch block.
530542
/// 4. The latch block has an exact exit count.
531543
/// 5. The loop does not contain reductions or recurrences.
532544
/// 6. We can prove at compile-time that loops will not contain faulting
533-
/// loads.
545+
/// loads, or that any faulting loads would also occur in a purely
546+
/// scalar loop.
534547
/// 7. It is safe to speculatively execute instructions such as divide or
535-
/// call instructions.
548+
/// call instructions.
536549
/// The list above is not based on theoretical limitations of vectorization,
537550
/// but simply a statement that more work is needed to support these
538551
/// additional cases safely.
539552
bool isVectorizableEarlyExitLoop();
540553

554+
/// When vectorizing an early exit loop containing side effects, we need to
555+
/// determine whether an uncountable exit will be taken before any operation
556+
/// that has side effects.
557+
///
558+
/// Consider a loop like the following:
559+
/// for (int i = 0; i < N; ++i) {
560+
/// a[i] = b[i];
561+
/// if (c[i] == 0)
562+
/// break;
563+
/// }
564+
///
565+
/// We have both a load and a store operation occurring before the condition
566+
/// is checked for early termination. We could potentially restrict
567+
/// vectorization to cases where we know all addresses are guaranteed to be
568+
/// dereferenceable, which would allow the load before the condition check to
569+
/// be vectorized.
570+
///
571+
/// The store, however, should not execute across all lanes if early
572+
/// termination occurs before the end of the vector. We must only store to the
573+
/// locations that would have been stored to by a scalar loop. So we need to
574+
/// know what the result of 'c[i] == 0' is before performing the vector store,
575+
/// with or without masking.
576+
///
577+
/// We can either do this by moving the condition load to the top of the
578+
/// vector body and using the comparison to create masks for other operations
579+
/// in the loop, or by looking ahead one vector iteration and bailing out to
580+
/// the scalar loop if an exit would occur.
581+
///
582+
/// Using the latter approach (applicable to more targets), we need to hoist
583+
/// the first load (of c[0]) out of the loop then rotate the load within the
584+
/// loop to the next iteration, remembering to adjust the vector trip count.
585+
/// Something like the following:
586+
///
587+
/// vec.ph:
588+
/// %ci.0 = load <4 x i32>, ptr %c
589+
/// %cmp.0 = icmp eq <4 x i32> %ci.0, zeroinitializer
590+
/// %any.of.0 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.0)
591+
/// br i1 %any.of.0, label %scalar.ph, label %vec.body
592+
/// vec.body:
593+
/// %iv = phi...
594+
/// phi for c[i] if used elsewhere in the loop...
595+
/// other operations in the loop...
596+
/// %iv.next = add i64 %iv, 4
597+
/// %addr.next = getelementptr i32, ptr %c, i64 %iv.next
598+
/// %ci.next = load <4 x i32>, ptr %addr.next
599+
/// %cmp.next = icmp eq <4 x i32> %ci.next, zeroinitializer
600+
/// %any.of.next = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp.next)
601+
/// iv.next compared with shortened vector tripcount...
602+
/// uncountable condition combined with counted condition...
603+
/// br...
604+
///
605+
/// Doing this means the last few iterations will always be performed by a
606+
/// scalar loop regardless of which exit is taken, and so vector iterations
607+
/// will never execute a memory operation to a location that the scalar loop
608+
/// would not have.
609+
///
610+
/// This means we must ensure that it is safe to move the load for 'c[i]'
611+
/// before other memory operations (or any other observable side effects) in
612+
/// the loop.
613+
///
614+
/// Currently, c[i] must have only one user (the comparison used for the
615+
/// uncountable exit) since we would otherwise need to introduce a PHI node
616+
/// for it.
617+
bool canUncountableExitConditionLoadBeMoved(BasicBlock *ExitingBlock);
618+
541619
/// Return true if all of the instructions in the block can be speculatively
542620
/// executed, and record the loads/stores that require masking.
543621
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -646,6 +724,10 @@ class LoopVectorizationLegality {
646724
BlockFrequencyInfo *BFI;
647725
ProfileSummaryInfo *PSI;
648726

727+
// Alias Analysis results used to check for possible aliasing with loads
728+
// used in uncountable exit conditions.
729+
AAResults *AA;
730+
649731
/// If we discover function calls within the loop which have a valid
650732
/// vectorized variant, record that fact so that LoopVectorize can
651733
/// (potentially) make a better decision on the maximum VF and enable
@@ -659,6 +741,10 @@ class LoopVectorizationLegality {
659741
/// Keep track of an uncountable exiting block, if there is exactly one early
660742
/// exit.
661743
BasicBlock *UncountableExitingBB = nullptr;
744+
745+
/// If true, the loop has at least one uncountable exit and operations within
746+
/// the loop may have observable side effects.
747+
bool UncountableExitWithSideEffects = false;
662748
};
663749

664750
} // namespace llvm

llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
152152
LoopAccessInfoManager *LAIs;
153153
OptimizationRemarkEmitter *ORE;
154154
ProfileSummaryInfo *PSI;
155+
AAResults *AA;
155156

156157
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
157158
LLVM_ABI void

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 142 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
//
1616

1717
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18+
#include "llvm/Analysis/AliasAnalysis.h"
1819
#include "llvm/Analysis/Loads.h"
1920
#include "llvm/Analysis/LoopInfo.h"
21+
#include "llvm/Analysis/MustExecute.h"
2022
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
2123
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
2224
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
12231225
});
12241226
}
12251227

1226-
if (!LAI->canVectorizeMemory())
1228+
if (!LAI->canVectorizeMemory()) {
1229+
if (hasUncountableExitWithSideEffects()) {
1230+
reportVectorizationFailure(
1231+
"Cannot vectorize unsafe dependencies in uncountable exit loop with "
1232+
"side effects",
1233+
"CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
1234+
TheLoop);
1235+
return false;
1236+
}
1237+
12271238
return canVectorizeIndirectUnsafeDependences();
1239+
}
12281240

12291241
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
12301242
reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1755,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17551767
}
17561768
};
17571769

1770+
bool HasSideEffects = false;
17581771
for (auto *BB : TheLoop->blocks())
17591772
for (auto &I : *BB) {
17601773
if (I.mayWriteToMemory()) {
1761-
// We don't support writes to memory.
1774+
if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) {
1775+
HasSideEffects = true;
1776+
continue;
1777+
}
1778+
1779+
// We don't support complex writes to memory.
17621780
reportVectorizationFailure(
1763-
"Writes to memory unsupported in early exit loops",
1764-
"Cannot vectorize early exit loop with writes to memory",
1781+
"Complex writes to memory unsupported in early exit loops",
1782+
"Cannot vectorize early exit loop with complex writes to memory",
17651783
"WritesInEarlyExitLoop", ORE, TheLoop);
17661784
return false;
1767-
} else if (!IsSafeOperation(&I)) {
1785+
}
1786+
1787+
if (!IsSafeOperation(&I)) {
17681788
reportVectorizationFailure("Early exit loop contains operations that "
17691789
"cannot be speculatively executed",
17701790
"UnsafeOperationsEarlyExitLoop", ORE,
@@ -1777,15 +1797,22 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17771797
assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
17781798
"Expected latch predecessor to be the early exiting block");
17791799

1780-
Predicates.clear();
17811800
SmallVector<LoadInst *, 4> NonDerefLoads;
1782-
if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
1783-
&Predicates)) {
1784-
reportVectorizationFailure("Loop may fault",
1785-
"Cannot vectorize non-read-only early exit loop",
1786-
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1801+
// TODO: Handle loops that may fault.
1802+
if (!HasSideEffects) {
1803+
// Read-only loop.
1804+
Predicates.clear();
1805+
if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
1806+
&Predicates)) {
1807+
reportVectorizationFailure(
1808+
"Loop may fault", "Cannot vectorize non-read-only early exit loop",
1809+
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1810+
return false;
1811+
}
1812+
} else if (!canUncountableExitConditionLoadBeMoved(
1813+
SingleUncountableExitingBlock))
17871814
return false;
1788-
}
1815+
17891816
// Check non-dereferenceable loads if any.
17901817
for (LoadInst *LI : NonDerefLoads) {
17911818
// Only support unit-stride access for now.
@@ -1813,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
18131840
"backedge taken count: "
18141841
<< *SymbolicMaxBTC << '\n');
18151842
UncountableExitingBB = SingleUncountableExitingBlock;
1843+
UncountableExitWithSideEffects = HasSideEffects;
1844+
return true;
1845+
}
1846+
1847+
bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
1848+
BasicBlock *ExitingBlock) {
1849+
// Try to find a load in the critical path for the uncountable exit condition.
1850+
// This is currently matching about the simplest form we can, expecting
1851+
// only one in-loop load, the result of which is directly compared against
1852+
// a loop-invariant value.
1853+
// FIXME: We're insisting on a single use for now, because otherwise we will
1854+
// need to make PHI nodes for other users. That can be done once the initial
1855+
// transform code lands.
1856+
auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());
1857+
1858+
using namespace llvm::PatternMatch;
1859+
Instruction *L = nullptr;
1860+
Value *Ptr = nullptr;
1861+
Value *R = nullptr;
1862+
if (!match(Br->getCondition(),
1863+
m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
1864+
m_Value(R))))) {
1865+
reportVectorizationFailure(
1866+
"Early exit loop with store but no supported condition load",
1867+
"NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1868+
return false;
1869+
}
1870+
1871+
// FIXME: Don't rely on operand ordering for the comparison.
1872+
if (!TheLoop->isLoopInvariant(R)) {
1873+
reportVectorizationFailure(
1874+
"Early exit loop with store but no supported condition load",
1875+
"NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1876+
return false;
1877+
}
1878+
1879+
// Make sure that the load address is not loop invariant; we want an
1880+
// address calculation that we can rotate to the next vector iteration.
1881+
const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
1882+
if (!isa<SCEVAddRecExpr>(PtrScev)) {
1883+
reportVectorizationFailure(
1884+
"Uncountable exit condition depends on load with an address that is "
1885+
"not an add recurrence",
1886+
"EarlyExitLoadInvariantAddress", ORE, TheLoop);
1887+
return false;
1888+
}
1889+
1890+
// FIXME: Support gathers after first-faulting load support lands.
1891+
SmallVector<const SCEVPredicate *, 4> Predicates;
1892+
LoadInst *Load = cast<LoadInst>(L);
1893+
if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC,
1894+
&Predicates)) {
1895+
reportVectorizationFailure(
1896+
"Loop may fault",
1897+
"Cannot vectorize potentially faulting early exit loop",
1898+
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1899+
return false;
1900+
}
1901+
1902+
ICFLoopSafetyInfo SafetyInfo;
1903+
SafetyInfo.computeLoopSafetyInfo(TheLoop);
1904+
// We need to know that the load will be executed before we can hoist a
1905+
// copy out to run just before the first iteration.
1906+
// FIXME: Currently, other restrictions prevent us from reaching this point
1907+
// with a loop where the uncountable exit condition is determined
1908+
// by a conditional load.
1909+
assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) &&
1910+
"Unhandled control flow in uncountable exit loop with side effects");
1911+
1912+
// Prohibit any potential aliasing with any instruction in the loop which
1913+
// might store to memory.
1914+
// FIXME: Relax this constraint where possible.
1915+
for (auto *BB : TheLoop->blocks()) {
1916+
for (auto &I : *BB) {
1917+
if (&I == Load)
1918+
continue;
1919+
1920+
if (I.mayWriteToMemory()) {
1921+
if (auto *SI = dyn_cast<StoreInst>(&I)) {
1922+
AliasResult AR = AA->alias(Ptr, SI->getPointerOperand());
1923+
if (AR == AliasResult::NoAlias)
1924+
continue;
1925+
}
1926+
1927+
reportVectorizationFailure(
1928+
"Cannot determine whether critical uncountable exit load address "
1929+
"does not alias with a memory write",
1930+
"CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
1931+
return false;
1932+
}
1933+
}
1934+
}
1935+
18161936
return true;
18171937
}
18181938

@@ -1885,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
18852005
} else {
18862006
if (!isVectorizableEarlyExitLoop()) {
18872007
assert(!hasUncountableEarlyExit() &&
2008+
!hasUncountableExitWithSideEffects() &&
18882009
"Must be false without vectorizable early-exit loop");
18892010
if (DoExtraAnalysis)
18902011
Result = false;
@@ -1903,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
19032024
return false;
19042025
}
19052026

2027+
// Bail out for state-changing loops with uncountable exits for now.
2028+
if (UncountableExitWithSideEffects) {
2029+
reportVectorizationFailure(
2030+
"Writes to memory unsupported in early exit loops",
2031+
"Cannot vectorize early exit loop with writes to memory",
2032+
"WritesInEarlyExitLoop", ORE, TheLoop);
2033+
return false;
2034+
}
2035+
19062036
if (Result) {
19072037
LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
19082038
<< (LAI->getRuntimePointerChecking()->Need

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9810,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
98109810
// Check if it is legal to vectorize the loop.
98119811
LoopVectorizationRequirements Requirements;
98129812
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9813-
&Requirements, &Hints, DB, AC, BFI, PSI);
9813+
&Requirements, &Hints, DB, AC, BFI, PSI, AA);
98149814
if (!LVL.canVectorize(EnableVPlanNativePath)) {
98159815
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
98169816
Hints.emitRemarkWithHints();
@@ -10247,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
1024710247
DB = &AM.getResult<DemandedBitsAnalysis>(F);
1024810248
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1024910249
LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10250+
AA = &AM.getResult<AAManager>(F);
1025010251

1025110252
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1025210253
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());

0 commit comments

Comments
 (0)