Skip to content

Commit 2ee921f

Browse files
committed
VectorCombine: Improve the insert/extract fold in the narrowing case
Keeping the extracted element in a natural position in the narrowed vector has two beneficial effects:

1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger.
2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded.

There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all.

commit-id:c151bb04
1 parent 316715e commit 2ee921f

File tree

5 files changed

+22
-42
lines changed

5 files changed

+22
-42
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4558,22 +4558,15 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
45584558
SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
45594559

45604560
bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4561-
bool IsExtIdxInBounds = ExtIdx < NumDstElts;
45624561
bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
45634562
if (NeedDstSrcSwap) {
45644563
SK = TargetTransformInfo::SK_PermuteSingleSrc;
4565-
if (!IsExtIdxInBounds && NeedExpOrNarrow)
4566-
Mask[InsIdx] = 0;
4567-
else
4568-
Mask[InsIdx] = ExtIdx;
4564+
Mask[InsIdx] = ExtIdx % NumDstElts;
45694565
std::swap(DstVec, SrcVec);
45704566
} else {
45714567
SK = TargetTransformInfo::SK_PermuteTwoSrc;
45724568
std::iota(Mask.begin(), Mask.end(), 0);
4573-
if (!IsExtIdxInBounds && NeedExpOrNarrow)
4574-
Mask[InsIdx] = NumDstElts;
4575-
else
4576-
Mask[InsIdx] = ExtIdx + NumDstElts;
4569+
Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
45774570
}
45784571

45794572
// Cost
@@ -4594,14 +4587,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
45944587
NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
45954588
nullptr, {DstVec, SrcVec});
45964589
} else {
4597-
// When creating length-changing-vector, always create with a Mask whose
4598-
// first element has an ExtIdx, so that the first element of the vector
4599-
// being created is always the target to be extracted.
4590+
// When creating a length-changing-vector, always try to keep the relevant
4591+
// element in an equivalent position, so that bulk shuffles are more likely
4592+
// to be useful.
46004593
ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4601-
if (IsExtIdxInBounds)
4602-
ExtToVecMask[ExtIdx] = ExtIdx;
4603-
else
4604-
ExtToVecMask[0] = ExtIdx;
4594+
ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
46054595
// Add cost for expanding or narrowing
46064596
NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
46074597
DstVecTy, SrcVecTy, ExtToVecMask, CostKind);

llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,22 +88,8 @@ entry:
8888
define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) {
8989
; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening(
9090
; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] {
91-
; OPT-NEXT: [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17
92-
; OPT-NEXT: [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18
93-
; OPT-NEXT: [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19
94-
; OPT-NEXT: [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21
95-
; OPT-NEXT: [[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22
96-
; OPT-NEXT: [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23
97-
; OPT-NEXT: [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
98-
; OPT-NEXT: [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1
99-
; OPT-NEXT: [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2
100-
; OPT-NEXT: [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3
101-
; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 20, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
102-
; OPT-NEXT: [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7>
103-
; OPT-NEXT: [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5
104-
; OPT-NEXT: [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6
105-
; OPT-NEXT: [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7
106-
; OPT-NEXT: ret <8 x i8> [[O_7]]
91+
; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
92+
; OPT-NEXT: ret <8 x i8> [[TMP1]]
10793
;
10894
%i.0 = extractelement <32 x i8> %in, i64 16
10995
%i.1 = extractelement <32 x i8> %in, i64 17

llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,14 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
140140
}
141141

142142
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
143-
; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
144-
; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
145-
; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
146-
; CHECK-NEXT: ret <2 x double> [[INS]]
143+
; SSE-LABEL: @src_ins0_v2f64_ext3_v4f64(
144+
; SSE-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
145+
; SSE-NEXT: ret <2 x double> [[INS]]
146+
;
147+
; AVX-LABEL: @src_ins0_v2f64_ext3_v4f64(
148+
; AVX-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
149+
; AVX-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
150+
; AVX-NEXT: ret <2 x double> [[INS]]
147151
;
148152
%ext = extractelement <4 x double> %b, i32 3
149153
%ins = insertelement <2 x double> poison, double %ext, i32 0

llvm/test/Transforms/VectorCombine/X86/extract-insert.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
136136

137137
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
138138
; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
139-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
140-
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 2, i32 1>
139+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
140+
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
141141
; CHECK-NEXT: ret <2 x double> [[INS]]
142142
;
143143
%ext = extractelement <4 x double> %b, i32 3
@@ -185,8 +185,8 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
185185

186186
define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
187187
; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64(
188-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
189-
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
188+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
189+
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
190190
; CHECK-NEXT: ret <2 x double> [[INS]]
191191
;
192192
%ext = extractelement <4 x double> %b, i32 3

llvm/test/Transforms/VectorCombine/X86/pr126085.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ define i32 @test(ptr %a0) {
66
; CHECK-SAME: ptr [[A0:%.*]]) {
77
; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[A0]], align 1
88
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
9-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i8> [[LOAD]], i64 11
10-
; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i8> [[SHUF]], i8 [[ELT]], i64 1
9+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 11>
10+
; CHECK-NEXT: [[INS:%.*]] = shufflevector <4 x i8> [[SHUF]], <4 x i8> [[TMP1]], <4 x i32> <i32 0, i32 7, i32 2, i32 3>
1111
; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i8> [[INS]] to i32
1212
; CHECK-NEXT: ret i32 [[RES]]
1313
;

0 commit comments

Comments
 (0)