Skip to content

Commit ce99ef6

Browse files
committed
[AArch64] Add a subvector extract cost.
These can generally be emitted using an ext instruction or mov from the high half. The half half extracts can be free depending on the users, but that is not handled here, just the basic costs.
1 parent ca603d2 commit ce99ef6

File tree

3 files changed

+22
-16
lines changed

3 files changed

+22
-16
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4706,10 +4706,21 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
47064706
}
47074707

47084708
Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4709-
// Treat extractsubvector as single op permutation.
47104709
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4711-
if (IsExtractSubvector && LT.second.isFixedLengthVector())
4710+
// A sebvector extract can be implemented with a ext (or trivial extract, if
4711+
// from lane 0). This currently only handles low or high extracts to prevent
4712+
// SLP vectorizer regressions.
4713+
if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
4714+
if (LT.second.is128BitVector() &&
4715+
cast<FixedVectorType>(SubTp)->getNumElements() ==
4716+
LT.second.getVectorNumElements() / 2) {
4717+
if (Index == 0)
4718+
return 0;
4719+
if (Index == LT.second.getVectorNumElements() / 2)
4720+
return 1;
4721+
}
47124722
Kind = TTI::SK_PermuteSingleSrc;
4723+
}
47134724

47144725
// Check for broadcast loads, which are supported by the LD1R instruction.
47154726
// In terms of code-size, the shuffle vector is free when a load + dup get

llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,23 @@ define void @extract_half() {
1515
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1616
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1717
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1919
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> zeroinitializer
2020
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
2121
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
2222
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
2323
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
2424
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2525
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
26-
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
26+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2727
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2828
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
2929
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3030
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> zeroinitializer
3131
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
3232
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
3333
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
34-
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
34+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
3535
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3636
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
3737
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -114,19 +114,19 @@ define void @extract_qtr() {
114114
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
115115
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
116116
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
117-
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
117+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
118118
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
119119
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
120120
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
121121
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
122122
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
123-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
123+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
124124
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
125125
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
126126
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
127127
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
128128
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
129-
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
129+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
130130
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
131131
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
132132
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,16 +80,11 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
8080
; NOFP16-LABEL: define half @reduce_fast_half8(
8181
; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
8282
; NOFP16-NEXT: [[ENTRY:.*:]]
83-
; NOFP16-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
84-
; NOFP16-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
85-
; NOFP16-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
86-
; NOFP16-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
8783
; NOFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8884
; NOFP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
89-
; NOFP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
90-
; NOFP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
91-
; NOFP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
92-
; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
85+
; NOFP16-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
86+
; NOFP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
87+
; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
9388
; NOFP16-NEXT: ret half [[OP_RDX3]]
9489
;
9590
; FULLFP16-LABEL: define half @reduce_fast_half8(

0 commit comments

Comments
 (0)