Skip to content

Commit 18486c4

Browse files
alexey-bataevmemfrob
authored andcommitted
[PATCH] D105827: [SLP]Workaround for InsertSubVector cost.
The cost of the InsertSubvector shuffle kind cost is not complete and may end up with just extracts + inserts costs in many cases. Added a workaround to represent it as a generic PermuteSingleSrc, which is still pessimistic but better than InsertSubvector. Differential Revision: https://reviews.llvm.org/D105827
1 parent 68c650a commit 18486c4

14 files changed

+283
-345
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3885,14 +3885,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
38853885
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
38863886
/*Insert*/ true, /*Extract*/ false);
38873887

3888-
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0)
3888+
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
3889+
// FIXME: Replace with SK_InsertSubvector once it is properly supported.
3890+
unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
38893891
Cost += TTI->getShuffleCost(
3890-
TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
3891-
Offset,
3892-
FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
3893-
else if (!IsIdentity)
3892+
TargetTransformInfo::SK_PermuteSingleSrc,
3893+
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
3894+
} else if (!IsIdentity) {
38943895
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
38953896
ShuffleMask);
3897+
}
38963898

38973899
return Cost;
38983900
}

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
2222
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
2323
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
2424
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
25-
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
26-
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
2725
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
26+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
27+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
28+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
29+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
30+
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
2831
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
29-
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
30-
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
31-
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
32-
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
33-
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
34-
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
32+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
33+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
34+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
3535
;
3636
entry:
3737
%0 = load <4 x float>, <4 x float>* %a, align 16
@@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
218218
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
219219
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
220220
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
221-
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
222-
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
223221
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
222+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
223+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
224+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
225+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
226+
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
224227
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
225-
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
226-
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
227-
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
228-
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
229-
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
230-
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
228+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
229+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
230+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
231231
;
232232
entry:
233233
%0 = load <4 x float>, <4 x float>* %a, align 16
@@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) {
301301
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
302302
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
303303
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
304-
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
305-
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
306304
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
305+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
306+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
307+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
308+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
309+
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
307310
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
308-
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
309-
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
310-
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
311-
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
312-
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
313-
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
311+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
312+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
313+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
314314
;
315315
entry:
316316
%0 = load <4 x float>, <4 x float>* %a, align 16
@@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
477477
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
478478
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
479479
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
480-
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
481-
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
482480
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
481+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
482+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
483+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
484+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
485+
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
483486
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
484-
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
485-
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
486-
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
487-
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
488-
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
489-
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
487+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
488+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
489+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
490490
;
491491
entry:
492492
%0 = load <4 x float>, <4 x float>* %a, align 16
@@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
519519
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
520520
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
521521
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
522-
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
523-
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
524522
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
523+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
524+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
525+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
526+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
527+
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
525528
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
526-
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
527-
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
528-
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
529-
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
530-
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
531-
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
529+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
530+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
531+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
532532
;
533533
entry:
534534
%0 = load <4 x float>, <4 x float>* %a, align 16
@@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
10101010
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
10111011
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
10121012
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1013-
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1014-
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
10151013
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1014+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
1015+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
1016+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
1017+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1018+
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
10161019
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1017-
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
1018-
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
1019-
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
1020-
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1021-
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1022-
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
1020+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
1021+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
1022+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
10231023
;
10241024
entry:
10251025
%0 = load <4 x float>, <4 x float>* %a, align 16

0 commit comments

Comments
 (0)