[SLPVectorizer] Widen strided loads. #153074
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Mikhail Gudim (mgudim)

Changes

Currently SLPVectorizer can generate strided loads only for this pattern:

```
load %base + 0 * %stride
load %base + 1 * %stride
...
load %base + n * %stride
```

In this PR we extend it to this pattern:

```
; load w consecutive elements starting at %base
load %base + 0 * %stride + 0
load %base + 0 * %stride + 1
load %base + 0 * %stride + 2
...
load %base + 0 * %stride + (w - 1)

; load w consecutive elements starting at %base + 1 * %stride
load %base + 1 * %stride + 0
load %base + 1 * %stride + 1
load %base + 1 * %stride + 2
...
load %base + 1 * %stride + (w - 1)
...

; load w consecutive elements starting at %base + n * %stride
load %base + n * %stride + 0
load %base + n * %stride + 1
load %base + n * %stride + 2
...
load %base + n * %stride + (w - 1)
```

This works for both run-time and constant strides.

Patch is 58.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153074.diff

4 Files Affected:
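Before the diff, a hedged source-level illustration (hypothetical code, not taken from the patch or its tests) of the access shape the widened pattern corresponds to, namely groups of consecutive loads whose group bases differ by a runtime stride:

```
// Hypothetical C++ sketch: eight scalar loads of the form base[i * stride + j]
// with i in {0, 1} and j in {0, 1, 2, 3}. Each group of four loads is
// consecutive in memory, and the two group bases differ by `stride` elements,
// which matches the widened strided-load shape described above.
int sum8(const int *base, long stride) {
  int a0 = base[0 * stride + 0], a1 = base[0 * stride + 1];
  int a2 = base[0 * stride + 2], a3 = base[0 * stride + 3];
  int b0 = base[1 * stride + 0], b1 = base[1 * stride + 1];
  int b2 = base[1 * stride + 2], b3 = base[1 * stride + 3];
  return a0 + a1 + a2 + a3 + b0 + b1 + b2 + b3;
}
```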
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index af78b3cc2c7ff..4d43cb7ec0300 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> SLPMaxVF(
"exclusively by SLP vectorizer."),
cl::Hidden);
+static cl::opt<bool> SLPPreferAltOpcVectorization(
+ "riscv-v-slp-prefer-alt-opc-vectorization",
+ cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
+ cl::Hidden);
+
static cl::opt<unsigned>
RVVMinTripCount("riscv-v-min-trip-count",
cl::desc("Set the lower bound of a trip count to decide on "
@@ -3018,3 +3023,7 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
}
return Options;
}
+
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+ return SLPPreferAltOpcVectorization;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..254908f97186c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -132,7 +132,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
- bool preferAlternateOpcodeVectorization() const override { return false; }
+ bool preferAlternateOpcodeVectorization() const override;
bool preferEpilogueVectorization() const override {
// Epilogue vectorization is usually unprofitable - tail folding or
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3045eeb3eb48e..207d9fc969f76 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,6 +1918,21 @@ class BoUpSLP {
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
+ // If we decide to generate strided load / store, this struct contains all the
+ // necessary info. Its fields are calculated by analyzeRtStrideCandidate and
+ // analyzeConstantStrideCandidate. Note that Stride can be given either as a
+ // SCEV or as a Value if it already exists.
+ // To get the stride in bytes, StrideVal (or the value obtained from
+ // StrideSCEV) has to be multiplied by the element size of FixedVectorType.
+ struct StridedPtrInfo {
+ Value *StrideVal = nullptr;
+ const SCEV *StrideSCEV = nullptr;
+ // The type of the widened load generated for this strided access.
+ FixedVectorType *Ty = nullptr;
+ };
+ DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
public:
/// Tracks the state we can represent the loads in the given sequence.
enum class LoadsState {
@@ -2078,6 +2093,7 @@ class BoUpSLP {
UserIgnoreList = nullptr;
PostponedGathers.clear();
ValueToGatherNodes.clear();
+ TreeEntryToStridedPtrInfoMap.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2214,6 +2230,17 @@ class BoUpSLP {
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
+ bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ StridedPtrInfo *SPtrInfo) const;
+
+ bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+ Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ StridedPtrInfo *SPtrInfo, int64_t Diff,
+ Value *Ptr0, Value *PtrN) const;
+
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
/// \param VL list of loads.
@@ -2227,6 +2254,7 @@ class BoUpSLP {
LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
SmallVectorImpl<Value *> &PointerOps,
+ StridedPtrInfo *SPtrInfo = nullptr,
unsigned *BestVF = nullptr,
bool TryRecursiveCheck = true) const;
@@ -4467,11 +4495,10 @@ class BoUpSLP {
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
- TreeEntry::EntryState
- getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
- bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps);
+ TreeEntry::EntryState getScalarsVectorizationState(
+ const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6314,18 +6341,12 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
});
}
-/// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices,
- Instruction *Inst = nullptr) {
+/// Returns a SCEV expression for the stride if PointerOps is a set of strided
+/// pointers, or nullptr otherwise.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ SmallVectorImpl<int64_t> &Coeffs) {
SmallVector<const SCEV *> SCEVs;
const SCEV *PtrSCEVLowest = nullptr;
const SCEV *PtrSCEVHighest = nullptr;
@@ -6334,7 +6355,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
for (Value *Ptr : PointerOps) {
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
if (!PtrSCEV)
- return std::nullopt;
+ return nullptr;
SCEVs.push_back(PtrSCEV);
if (!PtrSCEVLowest && !PtrSCEVHighest) {
PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6342,14 +6363,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
}
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Diff))
- return std::nullopt;
+ return nullptr;
if (Diff->isNonConstantNegative()) {
PtrSCEVLowest = PtrSCEV;
continue;
}
const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
if (isa<SCEVCouldNotCompute>(Diff1))
- return std::nullopt;
+ return nullptr;
if (Diff1->isNonConstantNegative()) {
PtrSCEVHighest = PtrSCEV;
continue;
@@ -6358,7 +6379,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
// Dist = PtrSCEVHighest - PtrSCEVLowest;
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Dist))
- return std::nullopt;
+ return nullptr;
int Size = DL.getTypeStoreSize(ElemTy);
auto TryGetStride = [&](const SCEV *Dist,
const SCEV *Multiplier) -> const SCEV * {
@@ -6379,10 +6400,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
Stride = TryGetStride(Dist, Sz);
if (!Stride)
- return std::nullopt;
+ return nullptr;
}
if (!Stride || isa<SCEVConstant>(Stride))
- return std::nullopt;
+ return nullptr;
// Iterate through all pointers and check if all distances are
// unique multiple of Stride.
using DistOrdPair = std::pair<int64_t, int>;
@@ -6396,42 +6417,184 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
const SCEV *Coeff = TryGetStride(Diff, Stride);
if (!Coeff)
- return std::nullopt;
+ return nullptr;
const auto *SC = dyn_cast<SCEVConstant>(Coeff);
if (!SC || isa<SCEVCouldNotCompute>(SC))
- return std::nullopt;
+ return nullptr;
+ Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
SE.getMulExpr(Stride, SC)))
->isZero())
- return std::nullopt;
+ return nullptr;
Dist = SC->getAPInt().getZExtValue();
- }
+ } else
+ Coeffs.push_back(0);
// If the strides are not the same or repeated, we can't vectorize.
if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
- return std::nullopt;
+ return nullptr;
auto Res = Offsets.emplace(Dist, Cnt);
if (!Res.second)
- return std::nullopt;
+ return nullptr;
// Consecutive order if the inserted element is the last one.
IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
++Cnt;
}
if (Offsets.size() != SCEVs.size())
- return std::nullopt;
+ return nullptr;
SortedIndices.clear();
- if (!IsConsecutive) {
- // Fill SortedIndices array only if it is non-consecutive.
- SortedIndices.resize(PointerOps.size());
- Cnt = 0;
- for (const std::pair<int64_t, int> &Pair : Offsets) {
- SortedIndices[Cnt] = Pair.second;
- ++Cnt;
+ SortedIndices.resize(PointerOps.size());
+ Cnt = 0;
+ for (const std::pair<int64_t, int> &Pair : Offsets) {
+ SortedIndices[Cnt] = Pair.second;
+ ++Cnt;
+ }
+ return Stride;
+}
+
+// Suppose we are given pointers of the form: %b + x * %s + y * %c
+// where %c is constant. Check if the pointers can be rearranged as follows:
+// %b + 0 * %s + 0
+// %b + 0 * %s + 1
+// %b + 0 * %s + 2
+// ...
+// %b + 0 * %s + (w - 1)
+//
+// %b + 1 * %s + 0
+// %b + 1 * %s + 1
+// %b + 1 * %s + 2
+// ...
+// %b + 1 * %s + (w - 1)
+// ...
+//
+// If the pointers can be rearranged in the above pattern, it means that the
+// memory can be accessed with strided loads of width `w` and stride `%s`.
+bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
+ Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ StridedPtrInfo *SPtrInfo) const {
+ // Group the pointers by constant offset.
+ DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+ OffsetToPointerOpIdxMap;
+ for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ if (!PtrSCEV)
+ return false;
+
+ const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+ int64_t Offset = 0;
+ if (Add) {
+ for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
+ if (!SC)
+ continue;
+ Offset = (int64_t)(SC->getAPInt().getLimitedValue());
+ break;
+ }
}
+ OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+ OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
}
- if (!Inst)
- return nullptr;
- SCEVExpander Expander(SE, DL, "strided-load-vec");
- return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+ int NumOffsets = OffsetToPointerOpIdxMap.size();
+
+ const unsigned Sz = PointerOps.size();
+ unsigned VecSz = Sz;
+ Type *ScalarTy = ElemTy;
+ if (NumOffsets > 1) {
+ if (Sz % NumOffsets != 0)
+ return false;
+ VecSz = Sz / NumOffsets;
+ ScalarTy = Type::getIntNTy(SE->getContext(),
+ DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+ NumOffsets);
+ }
+ FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+ if (!TTI->isTypeLegal(StridedLoadTy) ||
+ !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+ return false;
+
+ SmallVector<int> SortedOffsetsV;
+ for (auto [K, V] : OffsetToPointerOpIdxMap) {
+ SortedOffsetsV.push_back(K);
+ }
+ llvm::sort(SortedOffsetsV);
+ if (NumOffsets > 1) {
+ int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+ if ((CommonDiff) != 1)
+ return false;
+ for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
+ if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+ return false;
+ }
+ }
+
+ int64_t LowestOffset = SortedOffsetsV[0];
+ SmallVector<Value *> &PointerOps0 =
+ OffsetToPointerOpIdxMap[LowestOffset].first;
+ SmallVector<unsigned> &IndicesInAllPointerOps0 =
+ OffsetToPointerOpIdxMap[LowestOffset].second;
+
+ SmallVector<int64_t> Coeffs0;
+ SmallVector<unsigned> SortedIndicesForOffset0;
+ const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+ SortedIndicesForOffset0, Coeffs0);
+ if (!Stride0)
+ return false;
+ unsigned NumCoeffs0 = Coeffs0.size();
+ if (NumCoeffs0 * NumOffsets != Sz)
+ return false;
+ llvm::sort(Coeffs0);
+
+ SmallVector<unsigned> SortedIndicesDraft;
+ SortedIndicesDraft.resize(Sz);
+ auto updateSortedIndices =
+ [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+ SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+ int64_t OffsetNum) {
+ unsigned Num = 0;
+ for (unsigned Idx : SortedIndicesForOffset) {
+ SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+ IndicesInAllPointerOps[Idx];
+ ++Num;
+ }
+ };
+
+ updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+ SmallVector<int64_t> Coeffs;
+ SmallVector<unsigned> SortedIndicesForOffset;
+ for (int i = 1; i < NumOffsets; ++i) {
+ Coeffs.clear();
+ SortedIndicesForOffset.clear();
+
+ int64_t Offset = SortedOffsetsV[i];
+ SmallVector<Value *> &PointerOpsForOffset =
+ OffsetToPointerOpIdxMap[Offset].first;
+ SmallVector<unsigned> &IndicesInAllPointerOps =
+ OffsetToPointerOpIdxMap[Offset].second;
+ const SCEV *StrideWithinGroup = calculateRtStride(
+ PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+ if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+ return false;
+ }
+ if (Coeffs.size() != NumCoeffs0)
+ return false;
+ llvm::sort(Coeffs);
+ for (unsigned i = 0; i < NumCoeffs0; ++i) {
+ if (Coeffs[i] != Coeffs0[i])
+ return false;
+ }
+
+ updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+ }
+
+ SortedIndices.clear();
+ SortedIndices = SortedIndicesDraft;
+ if (SPtrInfo) {
+ SPtrInfo->StrideSCEV = Stride0;
+ SPtrInfo->Ty = StridedLoadTy;
+ }
+ return true;
}
static std::pair<InstructionCost, InstructionCost>
@@ -6761,77 +6924,133 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
CompressMask, LoadVecTy);
}
-/// Checks if strided loads can be generated out of \p VL loads with pointers \p
-/// PointerOps:
-/// 1. Target with strided load support is detected.
-/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
-/// potential stride <= MaxProfitableLoadStride and the potential stride is
-/// power-of-2 (to avoid perf regressions for the very small number of loads)
-/// and max distance > number of loads, or potential stride is -1.
-/// 3. The loads are ordered, or number of unordered loads <=
-/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
-/// to avoid extra costs for very expensive shuffles).
-/// 4. Any pointer operand is an instruction with the users outside of the
-/// current graph (for masked gathers extra extractelement instructions
-/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
- ArrayRef<unsigned> Order,
- const TargetTransformInfo &TTI, const DataLayout &DL,
- ScalarEvolution &SE,
- const bool IsAnyPointerUsedOutGraph,
- const int64_t Diff) {
- const size_t Sz = VL.size();
- const uint64_t AbsoluteDiff = std::abs(Diff);
- Type *ScalarTy = VL.front()->getType();
- auto *VecTy = getWidenedType(ScalarTy, Sz);
- if (IsAnyPointerUsedOutGraph ||
- (AbsoluteDiff > Sz &&
- (Sz > MinProfitableStridedLoads ||
- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
- AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
- Diff == -(static_cast<int64_t>(Sz) - 1)) {
- int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
- if (Diff != Stride * static_cast<int64_t>(Sz - 1))
+// Same as analyzeRtStrideCandidate, but for constant strides.
+bool BoUpSLP::analyzeConstantStrideCandidate(
+ ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+ int64_t Diff, Value *Ptr0, Value *PtrN) const {
+ const unsigned Sz = PointerOps.size();
+ SmallVector<int64_t> SortedOffsetsFromBase;
+ SortedOffsetsFromBase.resize(Sz);
+ for (unsigned i = 0; i < Sz; ++i) {
+ Value *Ptr =
+ SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
+ SortedOffsetsFromBase[i] =
+ *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+ }
+
+ // Find where the first group ends.
+ assert(SortedOffsetsFromBase.size() > 1);
+ int64_t StrideWithinGroup =
+ SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+ unsigned GroupSize = 1;
+ for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+ if (SortedOffsetsFromBase[GroupSize] -
+ SortedOffsetsFromBase[GroupSize - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ unsigned VecSz = Sz;
+ Type *ScalarTy = ElemTy;
+ int64_t StrideIntVal = StrideWithinGroup;
+ FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+ if (Sz != GroupSize) {
+ if (Sz % GroupSize != 0)
return false;
- Align Alignment =
- cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
- ->getAlign();
- if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
+ VecSz = Sz / GroupSize;
+
+ if (StrideWithinGroup != 1)
return false;
- Value *Ptr0;
- Value *PtrN;
- if (Order.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[Order.front()];
- PtrN = PointerOps[Order.back()];
- }
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int64_t, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int64_t Dist = 0;
- if (Ptr == PtrN)
- Dist = Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+ unsigned VecSz = Sz / GroupSize;
+ ScalarTy = Type::getIntNTy(SE->getContext(),
+ DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+ GroupSize);
+ StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+ if (!TTI->isTypeLegal(StridedLoad...
[truncated]
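For readers of the truncated diff above, here is a minimal, self-contained model of the acceptance check that analyzeRtStrideCandidate performs. This is a plain C++ sketch with made-up types, not the LLVM implementation (which works on SCEV expressions and pointer operands): each pointer is decomposed into a stride coefficient plus a constant offset, the constant offsets must form a contiguous window, and every offset position must see the same multiset of stride coefficients.

```
#include <algorithm>
#include <cstdint>
#include <map>
#include <optional>
#include <vector>

// Each pointer is modeled as Coeff * Stride + ConstPart (the real code derives
// this decomposition from SCEV).
struct PtrForm {
  int64_t Coeff;
  int64_t ConstPart;
};

// Returns the group width `w` (number of consecutive elements per stride
// multiple) if the pointers match the widened strided pattern, std::nullopt
// otherwise.
std::optional<unsigned> groupWidth(const std::vector<PtrForm> &Ptrs) {
  // Group the stride coefficients by constant offset.
  std::map<int64_t, std::vector<int64_t>> Groups;
  for (const PtrForm &P : Ptrs)
    Groups[P.ConstPart].push_back(P.Coeff);

  const unsigned NumOffsets = Groups.size();
  if (Ptrs.empty() || Ptrs.size() % NumOffsets != 0)
    return std::nullopt;

  // The constant offsets must be consecutive (each differs by exactly 1).
  int64_t Prev = Groups.begin()->first;
  for (auto It = std::next(Groups.begin()); It != Groups.end(); ++It) {
    if (It->first - Prev != 1)
      return std::nullopt;
    Prev = It->first;
  }

  // Every offset group must contain the same multiset of stride coefficients.
  std::vector<int64_t> Ref = Groups.begin()->second;
  std::sort(Ref.begin(), Ref.end());
  for (auto &[Off, Coeffs] : Groups) {
    std::sort(Coeffs.begin(), Coeffs.end());
    if (Coeffs != Ref)
      return std::nullopt;
  }
  return NumOffsets;
}
```

In the patch itself the same idea appears twice: analyzeRtStrideCandidate handles runtime strides by calling calculateRtStride once per offset group, while analyzeConstantStrideCandidate handles constant strides by scanning the sorted offsets from the base pointer.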
this is the satd loop from x264. This test should probably get removed, but it's here for now just to show what code we generate with this patch.
What is the benefit of this compared to the existing solution?
By using the map we avoid adding an extra field to each node.
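(A tiny illustrative sketch of the side-table idea, using hypothetical stand-in types rather than the real TreeEntry and StridedPtrInfo:)

```
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Node { int Id = 0; };   // stands in for BoUpSLP::TreeEntry
struct StrideInfo {            // stands in for StridedPtrInfo
  int64_t StrideInElems = 0;
  unsigned GroupWidth = 0;
};

int main() {
  std::vector<Node> Tree(100);
  // Only nodes classified as strided loads get an entry; the remaining nodes
  // carry no extra payload, unlike adding new fields to Node itself.
  std::unordered_map<const Node *, StrideInfo> StrideTable;
  StrideTable[&Tree[3]] = {4, 2};
  StrideTable[&Tree[42]] = {16, 8};
  return StrideTable.count(&Tree[3]) == 1 ? 0 : 1;
}
```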
I mean, what are the perf benefits? Do you have any perf numbers?
On the x264 train workload this gives a 16.5% improvement in dynamic instruction count. No effect on other SPEC benchmarks.
Instruction count is not the best criterion; we need to measure real perf changes.
Looks promising in general
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 341a7f2 to 33dee74.
Force-pushed from 1457e15 to cef24ae.
Force-pushed from 7e6699b to 001c120.
Do this in a separate patch
In that separate patch should I:
(1) Record the SCEV result of the first call to calculateRtStride in a map and then use it when we generate code? (like here https://github.com/llvm/llvm-project/pull/152359/files)
or
(2) Just make calculateRtStride return the SCEV and still call it twice?
(3) two patches: first (1), then (2)?
Just split the calculateRtStride
BTW, I can make a separate NFC patch to move the code into analyzeConstantStrideCandidate. Should I do that?
Yes
> Just split the calculateRtStride
Sorry, what do you mean by "split"?
I mean, make it return SCEV * instead of optional and move SCEV expansion to a callee function
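(A minimal sketch of that split, with made-up stand-in types instead of SCEV, just to show the shape: the analysis step returns a symbolic stride and creates no IR, and materialization happens later, only for groups that are actually vectorized, which is where the real code would use SCEVExpander as the old calculateRtStride did.)

```
#include <cstdint>
#include <optional>
#include <vector>

struct SymStride { int64_t Value; };   // stands in for `const SCEV *`

// Analysis step: pure, may run many times during cost modeling.
std::optional<SymStride> detectStride(const std::vector<int64_t> &Offsets) {
  if (Offsets.size() < 2)
    return std::nullopt;
  int64_t Stride = Offsets[1] - Offsets[0];
  for (size_t I = 2; I < Offsets.size(); ++I)
    if (Offsets[I] - Offsets[I - 1] != Stride)
      return std::nullopt;
  return SymStride{Stride};
}

// Emission step: runs once, only if the strided load is actually generated.
int64_t materializeStride(const SymStride &S) { return S.Value; }
```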
> BTW, I can make a separate NFC patch to move the code into analyzeConstantStrideCandidate. Should I do that?
> Yes
Actually this doesn't make sense, because it would be almost the same as just renaming "isStridedLoad"
Force-pushed from 6c659a9 to ae3d20b.
@alexey-bataev Do you have any more feedback?
Why is a cast needed here?
getLimitedValue returns uint64_t
Can you try to introduce these functions and their implementation in separate NFC patches? With the original functionality? Just to reduce the number of changes in this patch
OK. I'll do that soon. I also separated the SPtrInfo stuff into a separate patch:
#157706
called with nullptr
Force-pushed from 7e8fa8e to 60864f1.