Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ class TargetTransformInfo;
struct VectorizerParams {
/// Maximum SIMD width.
static const unsigned MaxVectorWidth;
/// Maximum unroll factor. Can represent actual unroll factor and/or
/// some other target-specific features, like LMUL factor for RISC-V with RVV
/// support.
static const unsigned MaxVectorUF;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment, we never interleave in EVL mode, why do we need to account for LMUL here?


/// VF as overridden by the user.
static unsigned VectorizationFactor;
Expand Down Expand Up @@ -180,9 +184,10 @@ class MemoryDepChecker {

MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
const DenseMap<Value *, const SCEV *> &SymbolicStrides,
unsigned MaxTargetVectorWidthInBits)
unsigned MaxTargetVectorWidthInBits, bool AllowNonPow2Deps)
: PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides),
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
AllowNonPow2Deps(AllowNonPow2Deps) {}

/// Register the location (instructions are given increasing numbers)
/// of a write access.
Expand Down Expand Up @@ -216,6 +221,18 @@ class MemoryDepChecker {
return MaxSafeVectorWidthInBits;
}

/// Return the power-of-2 number of elements that can be operated on
/// simultaneously without preventing store-load forwarding. This is the first
/// member of MaxStoreLoadForwardSafeVF; the optional is empty if no value has
/// been stored there.
std::optional<uint64_t> getStoreLoadForwardSafeVFPowerOf2() const {
return MaxStoreLoadForwardSafeVF.first;
}

/// Return the non-power-of-2 number of elements that can be operated on
/// simultaneously without preventing store-load forwarding. This is the second
/// member of MaxStoreLoadForwardSafeVF; the optional is empty if no value has
/// been stored there.
std::optional<uint64_t> getStoreLoadForwardSafeVFNonPowerOf2() const {
return MaxStoreLoadForwardSafeVF.second;
}

/// In some cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
bool shouldRetryWithRuntimeCheck() const {
Expand Down Expand Up @@ -304,6 +321,11 @@ class MemoryDepChecker {
/// restrictive.
uint64_t MaxSafeVectorWidthInBits = -1U;

/// Maximum numbers of elements (power-of-2 first, non-power-of-2 second) that
/// are safe to operate on simultaneously without preventing store-load
/// forwarding.
std::pair<std::optional<uint64_t>, std::optional<uint64_t>>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to distinguish between non-power and power of 2 here, can't we just use the max of both?

MaxStoreLoadForwardSafeVF;

/// If we see a non-constant dependence distance we can still try to
/// vectorize this loop with runtime checks.
bool FoundNonConstantDistanceDependence = false;
Expand All @@ -328,6 +350,9 @@ class MemoryDepChecker {
/// backwards-vectorizable or unknown (triggering a runtime check).
unsigned MaxTargetVectorWidthInBits = 0;

/// True if the current target supports non-power-of-2 dependence distances.
bool AllowNonPow2Deps = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is confusing, in general any target supports non-power-of-2 dependence distances, if >= VF?


/// Mapping of SCEV expressions to their expanded pointer bounds (pair of
/// start and end pointer expressions).
DenseMap<std::pair<const SCEV *, Type *>,
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,18 @@ class LoopVectorizationLegality {
return getUncountableExitBlocks()[0];
}

/// Return the power-of-2 number of elements that can be operated on
/// simultaneously without preventing store-load forwarding, as reported by
/// LAI's dependence checker.
/// NOTE(review): the dependence checker's getter returns
/// std::optional<uint64_t>, so converting to std::optional<unsigned> here
/// narrows the value -- confirm this is intended.
std::optional<unsigned> getMaxStoreLoadForwardSafeVFPowerOf2() const {
return LAI->getDepChecker().getStoreLoadForwardSafeVFPowerOf2();
}

/// Return the non-power-of-2 number of elements that can be operated on
/// simultaneously without preventing store-load forwarding, as reported by
/// LAI's dependence checker.
/// NOTE(review): the dependence checker's getter returns
/// std::optional<uint64_t>, so converting to std::optional<unsigned> here
/// narrows the value -- confirm this is intended.
std::optional<unsigned> getMaxStoreLoadForwardSafeVFNonPowerOf2() const {
return LAI->getDepChecker().getStoreLoadForwardSafeVFNonPowerOf2();
}

/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) const {
Expand Down
67 changes: 55 additions & 12 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;
/// Maximum unroll factor (can also represent, e.g., the LMUL factor for RISC-V
/// with RVV support).
const unsigned VectorizerParams::MaxVectorUF = 8;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
Expand Down Expand Up @@ -1752,31 +1754,71 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
// cause any slowdowns.
const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
// Maximum vector factor.
uint64_t MaxVFWithoutSLForwardIssues = std::min(
VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
MaxStoreLoadForwardSafeVF.first.value_or(
std::numeric_limits<uint64_t>::max()));
uint64_t MaxStoreLoadForwardSafeVFNonPower2 =
MaxStoreLoadForwardSafeVF.second.value_or(
AllowNonPow2Deps ? std::numeric_limits<uint64_t>::max() : 0);
uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 =
std::min(VectorizerParams::MaxVectorUF *
VectorizerParams::MaxVectorWidth * TypeByteSize,
MaxStoreLoadForwardSafeVFNonPower2);

// Compute the smallest VF at which the store and load would be misaligned.
for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
VF *= 2) {
for (uint64_t VF = 2 * TypeByteSize;
VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
// If the number of vector iteration between the store and the load are
// small we could incur conflicts.
if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
MaxVFWithoutSLForwardIssues = (VF >> 1);
MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
break;
}
}
// RISCV VLA supports non-power-of-2 vector factors. So, we iterate in a
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed for correctness for RISCV? If not, can be done separately as this adds some extra complexity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially, we can support only power-of-2. I can split this patch into 2 sub-patches.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes please, this would help to make the patch simpler hopefully.

Also MaxStoreLoadForwardSafeVF is not really a legality constraint but a cost constraint (to prevent cases where the HW supports store-to-load forwarding, which may be faster than a vector loop that does not allow store->load forwarding).

Is this relevant for cores supporting EVL?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The initial patch with only power-of-2 support is committed already, this one adds non-power-of-2

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also MaxStoreLoadForwardSafeVF is not really a legality constraint but a cost constraint (to prevent cases where the HW supports store-to-load forwarding, which may be faster than a vector loop that does not allow store->load forwarding).

Right.

Is this relevant for cores supporting EVL?
Yes, but it supports non-power-of-2 sizes due to its nature

// backward order to find the largest VF for which either the stores and loads
// are aligned or the number of iterations between conflicting memory
// addresses is not less than 8 (NumItersForStoreLoadThroughMemory).
if (AllowNonPow2Deps) {
for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2,
E = 2 * TypeByteSize;
VF >= E; VF -= TypeByteSize) {
if (Distance % VF == 0 ||
Distance / VF >= NumItersForStoreLoadThroughMemory) {
uint64_t GCD = MaxStoreLoadForwardSafeVF.second
? std::gcd(MaxStoreLoadForwardSafeVFNonPower2, VF)
: VF;
MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
break;
}
}
}

if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
LLVM_DEBUG(
dbgs() << "LAA: Distance " << Distance
<< " that could cause a store-load forwarding conflict\n");
return true;
}

if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
MaxVFWithoutSLForwardIssues !=
VectorizerParams::MaxVectorWidth * TypeByteSize)
MinDepDistBytes = MaxVFWithoutSLForwardIssues;
if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
MaxStoreLoadForwardSafeVF.first = 1;
else if (MaxVFWithoutSLForwardIssuesPowerOf2 <
MaxStoreLoadForwardSafeVF.first &&
MaxVFWithoutSLForwardIssuesPowerOf2 !=
VectorizerParams::MaxVectorWidth * TypeByteSize)
MaxStoreLoadForwardSafeVF.first = MaxVFWithoutSLForwardIssuesPowerOf2;
if (MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize)
MaxStoreLoadForwardSafeVF.second = 1;
else if (AllowNonPow2Deps &&
(!MaxStoreLoadForwardSafeVF.second ||
MaxVFWithoutSLForwardIssuesNonPowerOf2 <
*MaxStoreLoadForwardSafeVF.second) &&
MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
VectorizerParams::MaxVectorWidth * TypeByteSize)
MaxStoreLoadForwardSafeVF.second = MaxVFWithoutSLForwardIssuesNonPowerOf2;
return false;
}

Expand Down Expand Up @@ -3021,8 +3063,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
if (ScalableWidth.isNonZero())
MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
}
DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
MaxTargetVectorWidthInBits);
DepChecker = std::make_unique<MemoryDepChecker>(
*PSE, L, SymbolicStrides, MaxTargetVectorWidthInBits,
TTI && TTI->hasActiveVectorLength(0, nullptr, Align()));
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
if (canAnalyzeLoop())
CanVecMem = analyzeLoop(AA, LI, TLI, DT);
Expand Down
Loading
Loading