Skip to content

Commit cf1a11d

Browse files
committed
[𝘀𝗽𝗿] initial version
Created using spr 1.3.5
1 parent 3b0ec61 commit cf1a11d

File tree

3 files changed

+62
-62
lines changed

3 files changed

+62
-62
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5310,12 +5310,11 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
53105310
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
53115311
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
53125312
/// instead of a scalar.
5313-
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
5314-
Type *ScalarTy, VectorType *Ty,
5315-
const APInt &DemandedElts,
5316-
bool Insert, bool Extract,
5317-
TTI::TargetCostKind CostKind,
5318-
ArrayRef<Value *> VL = {}) {
5313+
static InstructionCost
5314+
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
5315+
VectorType *Ty, const APInt &DemandedElts, bool Insert,
5316+
bool Extract, TTI::TargetCostKind CostKind,
5317+
bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
53195318
assert(!isa<ScalableVectorType>(Ty) &&
53205319
"ScalableVectorType is not supported.");
53215320
assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
@@ -5339,8 +5338,19 @@ static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
53395338
}
53405339
return Cost;
53415340
}
5342-
return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
5343-
CostKind, VL);
5341+
APInt NewDemandedElts = DemandedElts;
5342+
InstructionCost Cost = 0;
5343+
if (!ForPoisonSrc && Insert) {
5344+
// Handle insert into non-poison vector.
5345+
unsigned LeftMostBit = NewDemandedElts.countr_zero();
5346+
NewDemandedElts.clearBit(LeftMostBit);
5347+
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
5348+
LeftMostBit, Constant::getNullValue(Ty));
5349+
}
5350+
return Cost + (NewDemandedElts.isZero()
5351+
? 0
5352+
: TTI.getScalarizationOverhead(Ty, NewDemandedElts, Insert,
5353+
Extract, CostKind, VL));
53445354
}
53455355

53465356
/// Correctly creates insert_subvector, checking that the index is multiple of
@@ -11684,6 +11694,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1168411694
// No need to delay the cost estimation during analysis.
1168511695
return std::nullopt;
1168611696
}
11697+
/// Reset the builder to handle perfect diamond match.
11698+
void resetForSameNode() {
11699+
IsFinalized = false;
11700+
CommonMask.clear();
11701+
InVectors.clear();
11702+
Cost = 0;
11703+
VectorizedVals.clear();
11704+
SameNodesEstimated = true;
11705+
}
1168711706
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
1168811707
if (&E1 == &E2) {
1168911708
assert(all_of(Mask,
@@ -14890,15 +14909,18 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1489014909
ShuffledElements.setBit(I);
1489114910
ShuffleMask[I] = Res.first->second;
1489214911
}
14893-
if (!DemandedElements.isZero())
14894-
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
14895-
/*Insert=*/true,
14896-
/*Extract=*/false, CostKind, VL);
14897-
if (ForPoisonSrc)
14912+
if (ForPoisonSrc) {
1489814913
Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
1489914914
/*DemandedElts*/ ~ShuffledElements,
1490014915
/*Insert*/ true,
14901-
/*Extract*/ false, CostKind, VL);
14916+
/*Extract*/ false, CostKind,
14917+
/*ForPoisonSrc=*/true, VL);
14918+
} else if (!DemandedElements.isZero()) {
14919+
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
14920+
/*Insert=*/true,
14921+
/*Extract=*/false, CostKind,
14922+
/*ForPoisonSrc=*/false, VL);
14923+
}
1490214924
if (DuplicateNonConst)
1490314925
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
1490414926
VecTy, ShuffleMask);
@@ -15556,6 +15578,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1555615578
PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
1555715579
MaybeAlign());
1555815580
}
15581+
/// Reset the builder to handle perfect diamond match.
15582+
void resetForSameNode() {
15583+
IsFinalized = false;
15584+
CommonMask.clear();
15585+
InVectors.clear();
15586+
}
1555915587
/// Adds 2 input vectors (in form of tree entries) and the mask for their
1556015588
/// shuffling.
1556115589
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
@@ -16111,6 +16139,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1611116139
Mask[I] = FrontTE->findLaneForValue(V);
1611216140
}
1611316141
}
16142+
// Reset the builder(s) to correctly handle perfect diamond matched
16143+
// nodes.
16144+
ShuffleBuilder.resetForSameNode();
1611416145
ShuffleBuilder.add(*FrontTE, Mask);
1611516146
// Full matched entry found, no need to insert subvectors.
1611616147
Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});

llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,15 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) {
1010
; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8
1111
; CHECK-NEXT: [[I4277:%.*]] = load double, ptr [[IE]], align 8
1212
; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8
13-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> zeroinitializer
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I4238]], i32 0
15-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I4252]], i32 1
16-
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]]
17-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
18-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[I4264]], i32 0
19-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[I4277]], i32 1
20-
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP7]]
21-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
22-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
23-
; CHECK-NEXT: [[I44281:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
24-
; CHECK-NEXT: ret <4 x double> [[I44281]]
13+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> <i32 0, i32 poison>
14+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
15+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
16+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[I4238]], i32 0
17+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[I4252]], i32 1
18+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[I4264]], i32 2
19+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[I4277]], i32 3
20+
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP7]]
21+
; CHECK-NEXT: ret <4 x double> [[TMP8]]
2522
;
2623
%i4238 = load double, ptr %ia, align 8
2724
%i4252 = load double, ptr %ib, align 8

llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -49,24 +49,10 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
4949
;
5050
; AVX512-LABEL: @reduce_and4(
5151
; AVX512-NEXT: entry:
52-
; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
53-
; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
54-
; AVX512-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
55-
; AVX512-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
56-
; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
57-
; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
58-
; AVX512-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
59-
; AVX512-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
60-
; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
61-
; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[VECEXT8]], i32 8
62-
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT7]], i32 9
63-
; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT10]], i32 10
64-
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT12]], i32 11
65-
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 12
66-
; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT]], i32 13
67-
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT2]], i32 14
68-
; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT4]], i32 15
69-
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP8]])
52+
; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
53+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
54+
; AVX512-NEXT: [[RDX_OP:%.*]] = and <8 x i32> [[TMP0]], [[TMP1]]
55+
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[RDX_OP]])
7056
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
7157
; AVX512-NEXT: ret i32 [[OP_RDX1]]
7258
;
@@ -144,24 +130,10 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
144130
; AVX2-NEXT: ret i32 [[OP_RDX]]
145131
;
146132
; AVX512-LABEL: @reduce_and4_transpose(
147-
; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
148-
; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
149-
; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
150-
; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
151-
; AVX512-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
152-
; AVX512-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
153-
; AVX512-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
154-
; AVX512-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
155-
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
156-
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT24]], i32 8
157-
; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT16]], i32 9
158-
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT8]], i32 10
159-
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 11
160-
; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT23]], i32 12
161-
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT15]], i32 13
162-
; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT7]], i32 14
163-
; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[VECEXT]], i32 15
164-
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP9]])
133+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
134+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
135+
; AVX512-NEXT: [[RDX_OP:%.*]] = and <8 x i32> [[TMP1]], [[TMP2]]
136+
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[RDX_OP]])
165137
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
166138
; AVX512-NEXT: ret i32 [[OP_RDX1]]
167139
;

0 commit comments

Comments
 (0)