
Commit 1c62296
Author: Mikhail Gudim
[SLPVectorizer] Widen run-time-strided loads.
Suppose we are given pointers of the form `%b + x * %s + y * %c_i`, where the `%c_i` are constants and `%s` is a value fixed at run time. If the pointers can be rearranged as follows:

```
%b + 0 * %s + 0
%b + 0 * %s + 1
%b + 0 * %s + 2
...
%b + 0 * %s + w
%b + 1 * %s + 0
%b + 1 * %s + 1
%b + 1 * %s + 2
...
%b + 1 * %s + w
...
```

then the memory can be accessed with strided loads of width `w` and stride `%s`. This is motivated by the x264 benchmark.
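To make the check concrete, here is a minimal standalone sketch in plain C++ (not the LLVM implementation; `Ptr`, `isWidenableStridedGroup`, and all names are hypothetical) of the rearrangement condition the commit message describes: group the pointers by their constant offset, require the offsets to be contiguous, and require every offset group to cover the same multiset of stride multipliers.

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

// One pointer, already decomposed as %b + Coeff * %s + Offset.
struct Ptr {
  int64_t Coeff;  // multiplier of the run-time stride %s
  int64_t Offset; // compile-time constant addend
};

// Returns true if the pointers can be rearranged into the grid shown in the
// commit message: contiguous constant offsets, and the same multiset of
// stride multipliers within every offset group.
bool isWidenableStridedGroup(const std::vector<Ptr> &Ptrs) {
  if (Ptrs.empty())
    return false;
  std::map<int64_t, std::vector<int64_t>> CoeffsPerOffset;
  for (const Ptr &P : Ptrs)
    CoeffsPerOffset[P.Offset].push_back(P.Coeff);

  // Offsets must be contiguous (std::map iterates keys in sorted order).
  int64_t Prev = CoeffsPerOffset.begin()->first - 1;
  for (const auto &[Offset, Coeffs] : CoeffsPerOffset) {
    if (Offset != Prev + 1)
      return false;
    Prev = Offset;
  }

  // Every offset group must see the same multiset of stride multipliers.
  std::vector<int64_t> First = CoeffsPerOffset.begin()->second;
  std::sort(First.begin(), First.end());
  for (auto &[Offset, Coeffs] : CoeffsPerOffset) {
    std::sort(Coeffs.begin(), Coeffs.end());
    if (Coeffs != First)
      return false;
  }
  return true;
}
```

When this holds, the group can be loaded as one strided load whose element type is as many times wider than the scalar as there are offsets per stride multiple, which is what `analyzeRtStrideCandidate` in the diff below checks.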
Parent: 274a4c0

File tree

3 files changed: +145 −84 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 130 additions & 11 deletions
```diff
@@ -6529,7 +6529,8 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
 /// Otherwise, SCEV* of the stride value is returned.
 static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                      const DataLayout &DL, ScalarEvolution &SE,
-                                     SmallVectorImpl<unsigned> &SortedIndices) {
+                                     SmallVectorImpl<unsigned> &SortedIndices,
+                                     SmallVectorImpl<int64_t> &Coeffs) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
```
```diff
@@ -6604,12 +6605,14 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
         return nullptr;
+      Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
               ->isZero())
         return nullptr;
       Dist = SC->getAPInt().getZExtValue();
-    }
+    } else
+      Coeffs.push_back(0);
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
       return nullptr;
```
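With this change, `calculateRtStride` additionally reports, through the new `Coeffs` out-parameter, the constant multiplier of the stride for each pointer; 0 is pushed on the other branch, presumably for the pointer that carries no stride term (the surrounding condition is outside this hunk). The caller in `analyzeRtStrideCandidate` below compares these coefficient multisets across offset groups to confirm each group covers the same stride multiples.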
```diff
@@ -7105,18 +7108,134 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ScalarTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
                                        StridedPtrInfo &SPtrInfo) const {
+  // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
+  // is a constant, record the values from `PointerOps` and their indices in
+  // `PointerOps`, grouped by offset.
+  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+      OffsetToPointerOpIdxMap;
+  for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+    const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+    if (!PtrSCEV)
+      return false;
+
+    const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    int64_t Offset = 0;
+    if (Add) {
+      for (int I : seq<int>(Add->getNumOperands())) {
+        const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        if (!SC)
+          continue;
+        Offset = SC->getAPInt().getSExtValue();
+        break;
+      }
+    }
+    OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+    OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
+  }
+  int NumOffsets = OffsetToPointerOpIdxMap.size();
+
   const unsigned Sz = PointerOps.size();
-  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
-  if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
-      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+  unsigned VecSz = Sz;
+  Type *NewScalarTy = ScalarTy;
+  if (NumOffsets > 1) {
+    if (Sz % NumOffsets != 0)
+      return false;
+    VecSz = Sz / NumOffsets;
+    NewScalarTy = Type::getIntNTy(SE->getContext(),
+                                  DL->getTypeSizeInBits(ScalarTy).getFixedValue() *
+                                      NumOffsets);
+  }
+  FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
+  if (!(Sz > MinProfitableStridedLoads && TTI->isTypeLegal(StridedLoadTy) &&
+        TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)))
     return false;
-  if (const SCEV *Stride =
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
-    SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
-    SPtrInfo.StrideSCEV = Stride;
-    return true;
+
+  // Check if the offsets are contiguous.
+  SmallVector<int64_t> SortedOffsetsV;
+  for (auto [K, _] : OffsetToPointerOpIdxMap)
+    SortedOffsetsV.push_back(K);
+  sort(SortedOffsetsV);
+  if (NumOffsets > 1) {
+    int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if (CommonDiff != 1)
+      return false;
+    for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+      if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
+        return false;
+    }
   }
-  return false;
+
+  // For each set of pointers with the same offset, check that the distances
+  // between adjacent pointers all equal the same value (the stride). As we
+  // do that, also calculate SortedIndices. Since we should not modify
+  // `SortedIndices` unless we know that all the checks succeeded, record the
+  // indices into `SortedIndicesDraft`.
+  int64_t LowestOffset = SortedOffsetsV[0];
+  SmallVector<Value *> &PointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].first;
+  SmallVector<unsigned> &IndicesInAllPointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].second;
+
+  SmallVector<int64_t> Coeffs0;
+  SmallVector<unsigned> SortedIndicesForOffset0;
+  const SCEV *Stride0 = calculateRtStride(PointerOps0, ScalarTy, *DL, *SE,
+                                          SortedIndicesForOffset0, Coeffs0);
+  if (!Stride0)
+    return false;
+  unsigned NumCoeffs0 = Coeffs0.size();
+  if (NumCoeffs0 * NumOffsets != Sz)
+    return false;
+  sort(Coeffs0);
+
+  SmallVector<unsigned> SortedIndicesDraft;
+  SortedIndicesDraft.resize(Sz);
+  auto UpdateSortedIndices =
+      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+          const SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+          const int64_t OffsetNum) {
+        if (SortedIndicesForOffset.empty()) {
+          SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
+          std::iota(SortedIndicesForOffset.begin(),
+                    SortedIndicesForOffset.end(), 0);
+        }
+        for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
+          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+              IndicesInAllPointerOps[Idx];
+        }
+      };
+
+  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+  SmallVector<int64_t> Coeffs;
+  SmallVector<unsigned> SortedIndicesForOffset;
+  for (int I : seq<int>(1, NumOffsets)) {
+    Coeffs.clear();
+    SortedIndicesForOffset.clear();
+
+    int64_t Offset = SortedOffsetsV[I];
+    SmallVector<Value *> &PointerOpsForOffset =
+        OffsetToPointerOpIdxMap[Offset].first;
+    SmallVector<unsigned> &IndicesInAllPointerOps =
+        OffsetToPointerOpIdxMap[Offset].second;
+    const SCEV *StrideWithinGroup = calculateRtStride(
+        PointerOpsForOffset, ScalarTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
+      return false;
+    if (Coeffs.size() != NumCoeffs0)
+      return false;
+    sort(Coeffs);
+    if (Coeffs != Coeffs0)
+      return false;
+
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+  }
+
+  SortedIndices.clear();
+  SortedIndices = SortedIndicesDraft;
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
+  return true;
 }
 
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
```
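To make the index bookkeeping concrete, here is a hypothetical worked example in plain C++ (not the LLVM API; the container shapes and values are invented for illustration) of the `SortedIndicesDraft[Num * NumOffsets + OffsetNum]` interleaving: the members of each offset group, sorted by stride coefficient, are spread so that the final order visits all offsets of one stride multiple before moving to the next.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Assume Sz = 8 pointers and NumOffsets = 2 offset groups (offsets 0 and 1),
  // with 4 stride coefficients per group. IndicesInAllPointerOps[g][n] is the
  // position in the original PointerOps of the n-th sorted member of group g.
  const unsigned NumOffsets = 2;
  std::vector<std::vector<unsigned>> IndicesInAllPointerOps = {
      {0, 2, 4, 6}, // offset 0: pointers %b + {0,1,2,3} * %s + 0
      {1, 3, 5, 7}, // offset 1: pointers %b + {0,1,2,3} * %s + 1
  };

  std::vector<unsigned> SortedIndicesDraft(8);
  for (unsigned OffsetNum = 0; OffsetNum < NumOffsets; ++OffsetNum)
    for (unsigned Num = 0; Num < 4; ++Num)
      SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
          IndicesInAllPointerOps[OffsetNum][Num];

  // Prints 0 1 2 3 4 5 6 7: adjacent positions pair the offset-0 and offset-1
  // bytes of the same stride multiple, which is the layout a widened strided
  // load produces.
  for (unsigned I : SortedIndicesDraft)
    std::printf("%u ", I);
  std::printf("\n");
}
```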

llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll

Lines changed: 3 additions & 17 deletions
```diff
@@ -710,25 +710,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-LABEL: define void @rt_stride_widen_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0
-; CHECK-NEXT:    [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1
-; CHECK-NEXT:    [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2
-; CHECK-NEXT:    [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]]
-; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]]
-; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]]
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT:    ret void
 ;
```
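The updated checks show the effect end to end: sixteen `i8` values at addresses `%pl + x * %stride + y` (x and y each in 0..3) were previously loaded as four `<4 x i8>` loads stitched together by a chain of shufflevectors; they now collapse into a single `llvm.experimental.vp.strided.load` of `<4 x i32>` with stride `%stride`, where each `i32` lane packs the four contiguous bytes of one stride multiple, followed by a bitcast back to `<16 x i8>`.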
