Merged
Changes from 2 commits
124 changes: 100 additions & 24 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2248,7 +2248,6 @@ class BoUpSLP {
/// Return true if an array of scalar loads can be replaced with a strided
/// load (with constant stride).
///
/// TODO:
/// It is possible that the load gets "widened". Suppose that originally each
/// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
/// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
@@ -6942,32 +6941,99 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
const size_t Sz = PointerOps.size();
if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
SmallVector<int64_t> SortedOffsetsFromBase(Sz);
// Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
for (unsigned I : seq<unsigned>(Sz)) {
Value *Ptr =
SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
SortedOffsetsFromBase[I] =
*getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
}

// The code below checks that `SortedOffsetsFromBase` looks as follows:
// ```
// [
// (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
// (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
// ...
// (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
// GroupSize - 1}), // last group
// ]
// ```
// The distance between consecutive elements within each group should all be
// the same `StrideWithinGroup`. The distance between the first elements of
// consecutive groups should all be the same `StrideBetweenGroups`.

int64_t StrideWithinGroup =
SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
// Determine size of the first group. Later we will check that all other
// groups have the same size.
auto isEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
StrideWithinGroup;
};
unsigned GroupSize = *llvm::find_if(seq<unsigned>(1, Sz), isEndOfGroupIndex);

unsigned VecSz = Sz;
Type *NewScalarTy = ScalarTy;
int64_t StrideIntVal = StrideWithinGroup;
FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
[Review thread]

Member: You can move it after the if with NeedsWidening check

Contributor (author): Done, but not sure that I understood you correctly.

Member: I mean, you can move FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz); after first if (NeedsWidening) { construct

Contributor (author): The StridedLoadTy changes value inside the first if (NeedsWidening) { in this line StridedLoadTy = getWidenedType(NewScalarTy, VecSz);

Member: Yes, just changes, does not use it. It means you can define it after this first if

Contributor (author): Oh, right. Sorry it took me so long to realize. I actually pushed it all the way down to its only use.

// Quick detour: at this point we can say what the type of strided load would
// be if all the checks pass. Check if this type is legal for the target.
bool NeedsWidening = Sz != GroupSize;
if (NeedsWidening) {
if (Sz % GroupSize != 0)
return false;
VecSz = Sz / GroupSize;

if (StrideWithinGroup != 1)
return false;
unsigned VecSz = Sz / GroupSize;
NewScalarTy = Type::getIntNTy(
SE->getContext(),
DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
}

if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
return false;

int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
if (NeedsWidening) {
// Continue with checking the "shape" of `SortedOffsetsFromBase`.
// Check that the strides between groups are all the same.
unsigned CurrentGroupStartIdx = GroupSize;
int64_t StrideBetweenGroups =
SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
StrideIntVal = StrideBetweenGroups;
for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
StrideBetweenGroups)
return false;
}

// Iterate through all pointers and check if all distances are
// unique multiple of Dist.
SmallSet<int64_t, 4> Dists;
for (Value *Ptr : PointerOps) {
int64_t Dist = 0;
if (Ptr == PtrN)
Dist = Diff;
else if (Ptr != Ptr0)
Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
// If the strides are not the same or repeated, we can't
// vectorize.
if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
break;
}
if (Dists.size() == Sz) {
Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
return true;
auto CheckGroup = [&](const unsigned StartIdx, const unsigned GroupSize0,
const int64_t StrideWithinGroup) -> bool {
unsigned GroupEndIdx = StartIdx + 1;
for (; GroupEndIdx != Sz; ++GroupEndIdx) {
if (SortedOffsetsFromBase[GroupEndIdx] -
SortedOffsetsFromBase[GroupEndIdx - 1] !=
StrideWithinGroup)
break;
}
return GroupEndIdx - StartIdx == GroupSize0;
[Review thread]

Member (suggested change: replace the break; with return false;): ?

Contributor (author): no, because we're computing GroupEndIdx in this loop.

Contributor (author): actually, I can rewrite this using find_if as with the previous

Member: Yes, please

Contributor (author): done
};
for (unsigned I = 0; I < Sz; I += GroupSize) {
if (!CheckGroup(I, GroupSize, StrideWithinGroup))
return false;
}
}
return false;

Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
SPtrInfo.Ty = StridedLoadTy;
return true;
}

bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
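For readers skimming the hunk above, here is a minimal standalone sketch of the grouping check that the new analyzeConstantStrideCandidate performs, written against plain integer offsets rather than LLVM pointers and types. The function name, the WidenedShape struct, and the std::optional return are hypothetical, and the sketch models only the widening path (more than one group); it is an illustration under those assumptions, not the committed implementation.

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Result of the (hypothetical) shape check: how many scalars fold into one
// widened element, and the constant stride between consecutive groups.
struct WidenedShape {
  unsigned GroupSize;
  int64_t StrideBetweenGroups;
};

// Offsets are assumed sorted and expressed in elements of the original
// scalar type, like SortedOffsetsFromBase in the patch.
std::optional<WidenedShape>
matchGroupedOffsets(const std::vector<int64_t> &Offsets) {
  const size_t Sz = Offsets.size();
  if (Sz < 2)
    return std::nullopt;

  // The within-group stride is fixed by the first pair; widening requires
  // the scalars inside a group to be adjacent, i.e. a stride of 1.
  const int64_t StrideWithinGroup = Offsets[1] - Offsets[0];

  // The first index at which the within-group stride breaks ends group 0.
  unsigned GroupSize = static_cast<unsigned>(Sz);
  for (unsigned I = 1; I < Sz; ++I) {
    if (Offsets[I] - Offsets[I - 1] != StrideWithinGroup) {
      GroupSize = I;
      break;
    }
  }

  // Model only the widening path: more than one group, equal group sizes,
  // and contiguous scalars within each group.
  if (GroupSize == Sz || Sz % GroupSize != 0 || StrideWithinGroup != 1)
    return std::nullopt;

  const int64_t StrideBetweenGroups = Offsets[GroupSize] - Offsets[0];
  for (unsigned Start = 0; Start < Sz; Start += GroupSize) {
    // First elements of consecutive groups must be the same distance apart.
    if (Start != 0 &&
        Offsets[Start] - Offsets[Start - GroupSize] != StrideBetweenGroups)
      return std::nullopt;
    // Every group must repeat the within-group stride of the first group.
    for (unsigned I = Start + 1; I < Start + GroupSize; ++I)
      if (Offsets[I] - Offsets[I - 1] != StrideWithinGroup)
        return std::nullopt;
  }
  return WidenedShape{GroupSize, StrideBetweenGroups};
}
```

With Offsets = {0, 1, 2, 3, 100, 101, 102, 103, 200, 201, 202, 203, 300, 301, 302, 303} this yields GroupSize = 4 and StrideBetweenGroups = 100; the actual patch additionally verifies the widened type via isStridedLoad before filling in StridedPtrInfo.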
@@ -14989,11 +15055,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
break;
case TreeEntry::StridedVectorize: {
const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getStridedMemoryOpCost(
Instruction::Load, VecTy, LI0->getPointerOperand(),
Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind);
if (StridedLoadTy != VecTy)
VecLdCost +=
TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
getCastContextHint(*E), CostKind);

break;
}
case TreeEntry::CompressVectorize: {
@@ -19760,6 +19834,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
? NewLI
: ::propagateMetadata(NewLI, E->Scalars);

if (StridedLoadTy != VecTy)
V = Builder.CreateBitOrPointerCast(V, VecTy);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
18 changes: 3 additions & 15 deletions llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5

; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s
; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -passes=slp-vectorizer -S < %s | FileCheck %s

define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
; CHECK-LABEL: define void @const_stride_1_no_reordering(
@@ -621,21 +621,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
; CHECK-LABEL: define void @constant_stride_widen_no_reordering(
; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100
; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200
; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300
; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4)
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
; CHECK-NEXT: ret void
;