
Commit b64fd18

Author: Mikhail Gudim

[SLPVectorizer] Widen run-time-strided loads.

Suppose we are given pointers of the form `%b + x * %s + y * %c_i`, where the `%c_i` are constants and `%s` is a value fixed only at run time. If the pointers can be rearranged as follows:

```
%b + 0 * %s + 0
%b + 0 * %s + 1
%b + 0 * %s + 2
...
%b + 0 * %s + w
%b + 1 * %s + 0
%b + 1 * %s + 1
%b + 1 * %s + 2
...
%b + 1 * %s + w
...
```

then the memory can be accessed with strided loads of width `w` and stride `%s`. This is motivated by the x264 benchmark.
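
For illustration only (this example is not part of the commit or its tests), the sketch below shows the pattern in LLVM IR under some assumptions: a hypothetical function `@widen_rt_stride_sketch` with base pointer `%b`, run-time stride `%s`, and output pointer `%out`; eight `i8` loads built from four stride multiples and the two constant offsets {0, 1}. In that situation the patch would group the pointers by constant offset (`NumOffsets = 2`), widen the element type from `i8` to `i16`, and could cover the eight loads with one strided load of width 4, provided the target reports the widened type and the strided load as legal and the `MinProfitableStridedLoads` threshold is met.

```llvm
; Hypothetical input: eight i8 loads whose addresses have the form
; %b + x * %s + y, with x in 0..3 (stride %s known only at run time)
; and constant offsets y in {0, 1}.
define void @widen_rt_stride_sketch(ptr %b, i64 %s, ptr %out) {
  ; Group y = 0: pointers %b + x * %s.
  %x1 = mul nsw i64 %s, 1
  %x2 = mul nsw i64 %s, 2
  %x3 = mul nsw i64 %s, 3
  %p0 = getelementptr inbounds i8, ptr %b, i64 0
  %p1 = getelementptr inbounds i8, ptr %b, i64 %x1
  %p2 = getelementptr inbounds i8, ptr %b, i64 %x2
  %p3 = getelementptr inbounds i8, ptr %b, i64 %x3
  ; Group y = 1: one byte past each pointer of the first group.
  %p0b = getelementptr inbounds i8, ptr %p0, i64 1
  %p1b = getelementptr inbounds i8, ptr %p1, i64 1
  %p2b = getelementptr inbounds i8, ptr %p2, i64 1
  %p3b = getelementptr inbounds i8, ptr %p3, i64 1
  %v0  = load i8, ptr %p0,  align 1
  %v0b = load i8, ptr %p0b, align 1
  %v1  = load i8, ptr %p1,  align 1
  %v1b = load i8, ptr %p1b, align 1
  %v2  = load i8, ptr %p2,  align 1
  %v2b = load i8, ptr %p2b, align 1
  %v3  = load i8, ptr %p3,  align 1
  %v3b = load i8, ptr %p3b, align 1
  ; Store the eight bytes contiguously so the store chain can seed SLP.
  %q1 = getelementptr inbounds i8, ptr %out, i64 1
  %q2 = getelementptr inbounds i8, ptr %out, i64 2
  %q3 = getelementptr inbounds i8, ptr %out, i64 3
  %q4 = getelementptr inbounds i8, ptr %out, i64 4
  %q5 = getelementptr inbounds i8, ptr %out, i64 5
  %q6 = getelementptr inbounds i8, ptr %out, i64 6
  %q7 = getelementptr inbounds i8, ptr %out, i64 7
  store i8 %v0,  ptr %out, align 1
  store i8 %v0b, ptr %q1,  align 1
  store i8 %v1,  ptr %q2,  align 1
  store i8 %v1b, ptr %q3,  align 1
  store i8 %v2,  ptr %q4,  align 1
  store i8 %v2b, ptr %q5,  align 1
  store i8 %v3,  ptr %q6,  align 1
  store i8 %v3b, ptr %q7,  align 1
  ret void
}

; With the grouping above (constant offsets 0 and 1), the element type can be
; widened from i8 to i16 and the eight loads could, in principle, be covered by
; a single strided load of width 4 and stride %s, along the lines of:
;   %w = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 1 %b, i64 %s, <4 x i1> splat (i1 true), i32 4)
```

Whether this form is actually emitted depends on the target's answers to `TTI->isTypeLegal` and `TTI->isLegalStridedLoadStore` and on the profitability threshold checked in `analyzeRtStrideCandidate`.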
Parent commit: 0833db8

2 files changed (+192 lines, -14 lines)


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 115 additions & 11 deletions

@@ -2266,7 +2266,7 @@ class BoUpSLP {
                                       int64_t Diff, Value *Ptr0, Value *PtrN,
                                       StridedPtrInfo &SPtrInfo) const;
 
-  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
+  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo &SPtrInfo) const;
@@ -6403,7 +6403,8 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
 /// Otherwise, SCEV* of the stride value is returned.
 static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                      const DataLayout &DL, ScalarEvolution &SE,
-                                     SmallVectorImpl<unsigned> &SortedIndices) {
+                                     SmallVectorImpl<unsigned> &SortedIndices,
+                                     SmallVectorImpl<int64_t> &Coeffs) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
@@ -6478,12 +6479,14 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
         return nullptr;
+      Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
               ->isZero())
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
-    }
+    } else
+      Coeffs.push_back(0);
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
       return nullptr;
@@ -6965,23 +6968,124 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
 }
 
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
-                                       Type *ScalarTy, Align CommonAlignment,
+                                       Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
                                        StridedPtrInfo &SPtrInfo) const {
+  // Group the pointers by constant offset.
+  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+      OffsetToPointerOpIdxMap;
+  for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+    const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+    if (!PtrSCEV)
+      return false;
+
+    const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    int64_t Offset = 0;
+    if (Add) {
+      for (int I : seq<int>(Add->getNumOperands())) {
+        const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        if (!SC)
+          continue;
+        Offset = SC->getAPInt().getSExtValue();
+        break;
+      }
+    }
+    OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+    OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
+  }
+  int NumOffsets = OffsetToPointerOpIdxMap.size();
+
   const unsigned Sz = PointerOps.size();
-  // TODO: VecSz may change if we widen the strided load.
   unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  if (NumOffsets > 1) {
+    if (Sz % NumOffsets != 0)
+      return false;
+    VecSz = Sz / NumOffsets;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   NumOffsets);
+  }
   FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
   if (!(Sz > MinProfitableStridedLoads && TTI->isTypeLegal(StridedLoadTy) &&
         TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)))
     return false;
-  if (const SCEV *Stride =
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
-    SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
-    SPtrInfo.StrideSCEV = Stride;
-    return true;
+
+  SmallVector<int64_t> SortedOffsetsV;
+  for (auto [K, _] : OffsetToPointerOpIdxMap)
+    SortedOffsetsV.push_back(K);
+  sort(SortedOffsetsV);
+  if (NumOffsets > 1) {
+    int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if (CommonDiff != 1)
+      return false;
+    for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+      if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
+        return false;
+    }
   }
-  return false;
+
+  int64_t LowestOffset = SortedOffsetsV[0];
+  SmallVector<Value *> &PointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].first;
+  SmallVector<unsigned> &IndicesInAllPointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].second;
+
+  SmallVector<int64_t> Coeffs0;
+  SmallVector<unsigned> SortedIndicesForOffset0;
+  const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+                                          SortedIndicesForOffset0, Coeffs0);
+  if (!Stride0)
+    return false;
+  unsigned NumCoeffs0 = Coeffs0.size();
+  if (NumCoeffs0 * NumOffsets != Sz)
+    return false;
+  sort(Coeffs0);
+
+  SmallVector<unsigned> SortedIndicesDraft;
+  SortedIndicesDraft.resize(Sz);
+  auto UpdateSortedIndices =
+      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+          SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+          int64_t OffsetNum) {
+        for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
+          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+              IndicesInAllPointerOps[Idx];
+        }
+      };
+
+  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+  SmallVector<int64_t> Coeffs;
+  SmallVector<unsigned> SortedIndicesForOffset;
+  for (int I : seq<int>(1, NumOffsets)) {
+    Coeffs.clear();
+    SortedIndicesForOffset.clear();
+
+    int64_t Offset = SortedOffsetsV[I];
+    SmallVector<Value *> &PointerOpsForOffset =
+        OffsetToPointerOpIdxMap[Offset].first;
+    SmallVector<unsigned> &IndicesInAllPointerOps =
+        OffsetToPointerOpIdxMap[Offset].second;
+    const SCEV *StrideWithinGroup = calculateRtStride(
+        PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
+      return false;
+    if (Coeffs.size() != NumCoeffs0)
+      return false;
+    sort(Coeffs);
+    if (Coeffs != Coeffs0)
+      return false;
+
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+  }
+
+  SortedIndices.clear();
+  SortedIndices = SortedIndicesDraft;
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
+  return true;
 }
 
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(

llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll

Lines changed: 77 additions & 3 deletions

@@ -332,11 +332,85 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-LABEL: define void @rt_stride_1_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[STRIDE0:%.*]] = mul nsw i64 [[STRIDE]], 0
+; CHECK-NEXT:    [[STRIDE1:%.*]] = mul nsw i64 [[STRIDE]], 1
+; CHECK-NEXT:    [[STRIDE2:%.*]] = mul nsw i64 [[STRIDE]], 2
+; CHECK-NEXT:    [[STRIDE3:%.*]] = mul nsw i64 [[STRIDE]], 3
+; CHECK-NEXT:    [[STRIDE4:%.*]] = mul nsw i64 [[STRIDE]], 4
+; CHECK-NEXT:    [[STRIDE5:%.*]] = mul nsw i64 [[STRIDE]], 5
+; CHECK-NEXT:    [[STRIDE6:%.*]] = mul nsw i64 [[STRIDE]], 6
+; CHECK-NEXT:    [[STRIDE7:%.*]] = mul nsw i64 [[STRIDE]], 7
+; CHECK-NEXT:    [[STRIDE8:%.*]] = mul nsw i64 [[STRIDE]], 8
+; CHECK-NEXT:    [[STRIDE9:%.*]] = mul nsw i64 [[STRIDE]], 9
+; CHECK-NEXT:    [[STRIDE10:%.*]] = mul nsw i64 [[STRIDE]], 10
+; CHECK-NEXT:    [[STRIDE11:%.*]] = mul nsw i64 [[STRIDE]], 11
+; CHECK-NEXT:    [[STRIDE12:%.*]] = mul nsw i64 [[STRIDE]], 12
+; CHECK-NEXT:    [[STRIDE13:%.*]] = mul nsw i64 [[STRIDE]], 13
+; CHECK-NEXT:    [[STRIDE14:%.*]] = mul nsw i64 [[STRIDE]], 14
+; CHECK-NEXT:    [[STRIDE15:%.*]] = mul nsw i64 [[STRIDE]], 15
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]]
+; CHECK-NEXT:    [[GEP_L1:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE1]]
+; CHECK-NEXT:    [[GEP_L2:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE2]]
+; CHECK-NEXT:    [[GEP_L3:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE3]]
+; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE4]]
+; CHECK-NEXT:    [[GEP_L5:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE5]]
+; CHECK-NEXT:    [[GEP_L6:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE6]]
+; CHECK-NEXT:    [[GEP_L7:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE7]]
+; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE8]]
+; CHECK-NEXT:    [[GEP_L9:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE9]]
+; CHECK-NEXT:    [[GEP_L10:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE10]]
+; CHECK-NEXT:    [[GEP_L11:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE11]]
+; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE12]]
+; CHECK-NEXT:    [[GEP_L13:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE13]]
+; CHECK-NEXT:    [[GEP_L14:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE14]]
+; CHECK-NEXT:    [[GEP_L15:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE15]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GEP_L0]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GEP_L1]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GEP_L2]], align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i8, ptr [[GEP_L3]], align 1
+; CHECK-NEXT:    [[LOAD4:%.*]] = load i8, ptr [[GEP_L4]], align 1
+; CHECK-NEXT:    [[LOAD5:%.*]] = load i8, ptr [[GEP_L5]], align 1
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i8, ptr [[GEP_L6]], align 1
+; CHECK-NEXT:    [[LOAD7:%.*]] = load i8, ptr [[GEP_L7]], align 1
+; CHECK-NEXT:    [[LOAD8:%.*]] = load i8, ptr [[GEP_L8]], align 1
+; CHECK-NEXT:    [[LOAD9:%.*]] = load i8, ptr [[GEP_L9]], align 1
+; CHECK-NEXT:    [[LOAD10:%.*]] = load i8, ptr [[GEP_L10]], align 1
+; CHECK-NEXT:    [[LOAD11:%.*]] = load i8, ptr [[GEP_L11]], align 1
+; CHECK-NEXT:    [[LOAD12:%.*]] = load i8, ptr [[GEP_L12]], align 1
+; CHECK-NEXT:    [[LOAD13:%.*]] = load i8, ptr [[GEP_L13]], align 1
+; CHECK-NEXT:    [[LOAD14:%.*]] = load i8, ptr [[GEP_L14]], align 1
+; CHECK-NEXT:    [[LOAD15:%.*]] = load i8, ptr [[GEP_L15]], align 1
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16)
-; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT:    [[GEP_S1:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 1
+; CHECK-NEXT:    [[GEP_S2:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 2
+; CHECK-NEXT:    [[GEP_S3:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 3
+; CHECK-NEXT:    [[GEP_S4:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 4
+; CHECK-NEXT:    [[GEP_S5:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 5
+; CHECK-NEXT:    [[GEP_S6:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 6
+; CHECK-NEXT:    [[GEP_S7:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 7
+; CHECK-NEXT:    [[GEP_S8:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 8
+; CHECK-NEXT:    [[GEP_S9:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 9
+; CHECK-NEXT:    [[GEP_S10:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 10
+; CHECK-NEXT:    [[GEP_S11:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 11
+; CHECK-NEXT:    [[GEP_S12:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 12
+; CHECK-NEXT:    [[GEP_S13:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 13
+; CHECK-NEXT:    [[GEP_S14:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 14
+; CHECK-NEXT:    [[GEP_S15:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 15
+; CHECK-NEXT:    store i8 [[LOAD0]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT:    store i8 [[LOAD1]], ptr [[GEP_S1]], align 1
+; CHECK-NEXT:    store i8 [[LOAD2]], ptr [[GEP_S2]], align 1
+; CHECK-NEXT:    store i8 [[LOAD3]], ptr [[GEP_S3]], align 1
+; CHECK-NEXT:    store i8 [[LOAD4]], ptr [[GEP_S4]], align 1
+; CHECK-NEXT:    store i8 [[LOAD5]], ptr [[GEP_S5]], align 1
+; CHECK-NEXT:    store i8 [[LOAD6]], ptr [[GEP_S6]], align 1
+; CHECK-NEXT:    store i8 [[LOAD7]], ptr [[GEP_S7]], align 1
+; CHECK-NEXT:    store i8 [[LOAD8]], ptr [[GEP_S8]], align 1
+; CHECK-NEXT:    store i8 [[LOAD9]], ptr [[GEP_S9]], align 1
+; CHECK-NEXT:    store i8 [[LOAD10]], ptr [[GEP_S10]], align 1
+; CHECK-NEXT:    store i8 [[LOAD11]], ptr [[GEP_S11]], align 1
+; CHECK-NEXT:    store i8 [[LOAD12]], ptr [[GEP_S12]], align 1
+; CHECK-NEXT:    store i8 [[LOAD13]], ptr [[GEP_S13]], align 1
+; CHECK-NEXT:    store i8 [[LOAD14]], ptr [[GEP_S14]], align 1
+; CHECK-NEXT:    store i8 [[LOAD15]], ptr [[GEP_S15]], align 1
 ; CHECK-NEXT:    ret void
 ;
   %stride0 = mul nsw i64 %stride, 0
