-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LAA] Add initial support for non-power-of-2 store-load forwarding distance #137873
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -182,9 +182,10 @@ class MemoryDepChecker { | |
|
|
||
| MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L, | ||
| const DenseMap<Value *, const SCEV *> &SymbolicStrides, | ||
| unsigned MaxTargetVectorWidthInBits) | ||
| unsigned MaxTargetVectorWidthInBits, bool AllowNonPow2Deps) | ||
| : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides), | ||
| MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {} | ||
| MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits), | ||
| AllowNonPow2Deps(AllowNonPow2Deps) {} | ||
|
|
||
| /// Register the location (instructions are given increasing numbers) | ||
| /// of a write access. | ||
|
|
@@ -221,17 +222,29 @@ class MemoryDepChecker { | |
|
|
||
| /// Return true if there are no store-load forwarding dependencies. | ||
| bool isSafeForAnyStoreLoadForwardDistances() const { | ||
| return MaxStoreLoadForwardSafeDistanceInBits == | ||
| std::numeric_limits<uint64_t>::max(); | ||
| return MaxPowerOf2StoreLoadForwardSafeDistanceInBits == | ||
| std::numeric_limits<uint64_t>::max() && | ||
| MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits == | ||
| std::numeric_limits<uint64_t>::max(); | ||
| } | ||
|
|
||
| /// Return safe power-of-2 number of elements, which do not prevent store-load | ||
| /// forwarding, multiplied by the size of the elements in bits. | ||
| uint64_t getStoreLoadForwardSafeDistanceInBits() const { | ||
| /// Return safe number of elements, which do not prevent store-load | ||
| /// forwarding, multiplied by the size of the elements in bits (power-of-2). | ||
| uint64_t getPowerOf2StoreLoadForwardSafeDistanceInBits() const { | ||
| assert(!isSafeForAnyStoreLoadForwardDistances() && | ||
| "Expected the distance, that prevent store-load forwarding, to be " | ||
| "set."); | ||
| return MaxStoreLoadForwardSafeDistanceInBits; | ||
| return MaxPowerOf2StoreLoadForwardSafeDistanceInBits; | ||
| } | ||
|
|
||
| /// Return safe number of elements, which do not prevent store-load | ||
| /// forwarding, multiplied by the size of the elements in bits | ||
| /// (non-power-of-2). | ||
| uint64_t getNonPowerOf2StoreLoadForwardSafeDistanceInBits() const { | ||
| assert(!isSafeForAnyStoreLoadForwardDistances() && | ||
| "Expected the distance, that prevent store-load forwarding, to be " | ||
| "set."); | ||
| return MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits; | ||
| } | ||
|
|
||
| /// In same cases when the dependency check fails we can still | ||
|
|
@@ -322,9 +335,14 @@ class MemoryDepChecker { | |
| /// restrictive. | ||
| uint64_t MaxSafeVectorWidthInBits = -1U; | ||
|
|
||
| /// Maximum power-of-2 number of elements, which do not prevent store-load | ||
| /// forwarding, multiplied by the size of the elements in bits. | ||
| uint64_t MaxStoreLoadForwardSafeDistanceInBits = | ||
| /// Maximum number of elements, which do not prevent store-load forwarding, | ||
| /// multiplied by the size of the elements in bits (power-of-2). | ||
| uint64_t MaxPowerOf2StoreLoadForwardSafeDistanceInBits = | ||
| std::numeric_limits<uint64_t>::max(); | ||
|
|
||
| /// Maximum number of elements, which do not prevent store-load forwarding, | ||
| /// multiplied by the size of the elements in bits (non-power-of-2). | ||
| uint64_t MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits = | ||
| std::numeric_limits<uint64_t>::max(); | ||
|
|
||
| /// If we see a non-constant dependence distance we can still try to | ||
|
|
@@ -351,6 +369,9 @@ class MemoryDepChecker { | |
| /// backwards-vectorizable or unknown (triggering a runtime check). | ||
| unsigned MaxTargetVectorWidthInBits = 0; | ||
|
|
||
| /// True if current target supports non-power-of-2 dependence distances. | ||
| bool AllowNonPow2Deps = false; | ||
|
||
|
|
||
| /// Mapping of SCEV expressions to their expanded pointer bounds (pair of | ||
| /// start and end pointer expressions). | ||
| DenseMap<std::pair<const SCEV *, Type *>, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1819,7 +1819,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, | |
| // Maximum vector factor. | ||
| uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 = | ||
| std::min(VectorizerParams::MaxVectorWidth * TypeByteSize, | ||
| MaxStoreLoadForwardSafeDistanceInBits); | ||
| MaxPowerOf2StoreLoadForwardSafeDistanceInBits); | ||
| uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 = 0; | ||
|
|
||
| // Compute the smallest VF at which the store and load would be misaligned. | ||
| for (uint64_t VF = 2 * TypeByteSize; | ||
|
|
@@ -1831,24 +1832,61 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, | |
| break; | ||
| } | ||
| } | ||
| // If target supports active vector length, then it supports non-power-2 | ||
alexey-bataev marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // vector factor. So, we iterate in a backward order to find largest VF, which | ||
| // allows aligned stores-loads or the number of iterations between conflicting | ||
| // memory addresses is not less than 8 (NumItersForStoreLoadThroughMemory). | ||
| if (AllowNonPow2Deps) { | ||
| MaxVFWithoutSLForwardIssuesNonPowerOf2 = | ||
| std::min(8 * VectorizerParams::MaxVectorWidth / TypeByteSize, | ||
| MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits); | ||
|
|
||
| for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we count backwards here while forwards for the power-of-2 case?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The power-of-2 case tries to find the minimal supported vector factor. For non-power-of-2 it tries to find the largest (but still legal) dep distance, so it goes backwards
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok I see. But then it behaves different to Is Am I understanding correctly for example with, Max pow2 VF = 2, MaxNonPowOf2 VF = 9, LV can either chose 2 or 16 (with limiting VF to 9)? If that's the case, I think it. would be good to update the comment for
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not quite so. Any power-of-2 VF, but not any VF.
Not necessary. We can use any whole divider of the MaxVFWithoutSLForwardIssuesNonPowerOf2. Say, if MaxVFWithoutSLForwardIssuesNonPowerOf2 is 9, then we can use 3 and 9. If it is 6, we can use 2, 3, 6. All these are safe.
The vector factor can be 2, 4, 8 or 16. But with non-power-of-2 we need an extra check ( or special instruction), that the number of the processed elements is limited by 9 or 3 elements only.
Suggestions? Any preferences here?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Thanks for the extra info. I think it would be helpful to update the comment to. include a generalized variant of the extra info you shared regarding what VFs can be picked (highlighted below)
I guess most RISCV HW with EVL support will support store-load forwarding with non-power-of-2 distances? |
||
| VF > MaxVFWithoutSLForwardIssuesPowerOf2; VF -= TypeByteSize) { | ||
| if (Distance % VF == 0 || | ||
| Distance / VF >= NumItersForStoreLoadThroughMemory) { | ||
| uint64_t GCD = | ||
| isSafeForAnyStoreLoadForwardDistances() | ||
alexey-bataev marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ? VF | ||
| : std::gcd(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits, | ||
| VF); | ||
| MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) { | ||
| if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize && | ||
| MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) { | ||
| LLVM_DEBUG( | ||
| dbgs() << "LAA: Distance " << Distance | ||
| << " that could cause a store-load forwarding conflict\n"); | ||
| return true; | ||
| } | ||
|
|
||
| // Handle non-power-2 store-load forwarding distance, power-of-2 distance can | ||
| // be calculated. | ||
| if (AllowNonPow2Deps && CommonStride && | ||
| MaxVFWithoutSLForwardIssuesNonPowerOf2 < | ||
| MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits && | ||
| MaxVFWithoutSLForwardIssuesNonPowerOf2 != | ||
| 8 * VectorizerParams::MaxVectorWidth / TypeByteSize) { | ||
| uint64_t MaxVF = MaxVFWithoutSLForwardIssuesNonPowerOf2 / CommonStride; | ||
| uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; | ||
| MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to compute this separately? Would it instead be possible to always compute the non-power-of-2 version and then have users convert it to the closest power-of-2 if that's what they need?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, it won't work. I tried this, but there might be different results because of CommonStride value |
||
| std::min(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits); | ||
| } | ||
|
|
||
| if (CommonStride && | ||
| MaxVFWithoutSLForwardIssuesPowerOf2 < | ||
| MaxStoreLoadForwardSafeDistanceInBits && | ||
| MaxPowerOf2StoreLoadForwardSafeDistanceInBits && | ||
| MaxVFWithoutSLForwardIssuesPowerOf2 != | ||
| VectorizerParams::MaxVectorWidth * TypeByteSize) { | ||
| uint64_t MaxVF = | ||
| bit_floor(MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride); | ||
| uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; | ||
| MaxStoreLoadForwardSafeDistanceInBits = | ||
| std::min(MaxStoreLoadForwardSafeDistanceInBits, MaxVFInBits); | ||
| MaxPowerOf2StoreLoadForwardSafeDistanceInBits = | ||
| std::min(MaxPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits); | ||
| } | ||
| return false; | ||
| } | ||
|
|
@@ -2985,8 +3023,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, | |
| MaxTargetVectorWidthInBits = | ||
| TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; | ||
|
|
||
| DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides, | ||
| MaxTargetVectorWidthInBits); | ||
| DepChecker = std::make_unique<MemoryDepChecker>( | ||
| *PSE, L, SymbolicStrides, MaxTargetVectorWidthInBits, | ||
| TTI && TTI->hasActiveVectorLength()); | ||
| PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE); | ||
| if (canAnalyzeLoop()) | ||
| CanVecMem = analyzeLoop(AA, LI, TLI, DT); | ||
|
|
@@ -3000,7 +3039,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { | |
| OS << " with a maximum safe vector width of " | ||
| << DC.getMaxSafeVectorWidthInBits() << " bits"; | ||
| if (!DC.isSafeForAnyStoreLoadForwardDistances()) { | ||
| uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits(); | ||
| uint64_t SLDist = DC.getNonPowerOf2StoreLoadForwardSafeDistanceInBits(); | ||
| if (SLDist == std::numeric_limits<uint64_t>::max()) | ||
| SLDist = DC.getPowerOf2StoreLoadForwardSafeDistanceInBits(); | ||
|
Comment on lines
+3097
to
+3099
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This logic would be eliminated by not having two different fields. |
||
| OS << ", with a maximum safe store-load forward width of " << SLDist | ||
| << " bits"; | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems redundant to have both, when only one is ever going to be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't know before the vectorization which one is going to be used. Need to keep both