Skip to content

Commit 874eb8a

Browse files
committed
VectorCombine: Improve the insert/extract fold in the narrowing case
Keeping the extracted element in a natural position in the narrowed vector (i.e. at index `ExtIdx % NumDstElts`) has two beneficial effects:

1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger.
2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded.

There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all.

commit-id:c151bb04
1 parent 748b863 commit 874eb8a

File tree

5 files changed

+22
-41
lines changed

5 files changed

+22
-41
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4455,22 +4455,15 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
44554455
SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
44564456

44574457
bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4458-
bool IsExtIdxInBounds = ExtIdx < NumDstElts;
44594458
bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
44604459
if (NeedDstSrcSwap) {
44614460
SK = TargetTransformInfo::SK_PermuteSingleSrc;
4462-
if (!IsExtIdxInBounds && NeedExpOrNarrow)
4463-
Mask[InsIdx] = 0;
4464-
else
4465-
Mask[InsIdx] = ExtIdx;
4461+
Mask[InsIdx] = ExtIdx % NumDstElts;
44664462
std::swap(DstVec, SrcVec);
44674463
} else {
44684464
SK = TargetTransformInfo::SK_PermuteTwoSrc;
44694465
std::iota(Mask.begin(), Mask.end(), 0);
4470-
if (!IsExtIdxInBounds && NeedExpOrNarrow)
4471-
Mask[InsIdx] = NumDstElts;
4472-
else
4473-
Mask[InsIdx] = ExtIdx + NumDstElts;
4466+
Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
44744467
}
44754468

44764469
// Cost
@@ -4491,14 +4484,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
44914484
NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
44924485
nullptr, {DstVec, SrcVec});
44934486
} else {
4494-
// When creating length-changing-vector, always create with a Mask whose
4495-
// first element has an ExtIdx, so that the first element of the vector
4496-
// being created is always the target to be extracted.
4487+
// When creating a length-changing-vector, always try to keep the relevant
4488+
// element in an equivalent position, so that bulk shuffles are more likely
4489+
// to be useful.
44974490
ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4498-
if (IsExtIdxInBounds)
4499-
ExtToVecMask[ExtIdx] = ExtIdx;
4500-
else
4501-
ExtToVecMask[0] = ExtIdx;
4491+
ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
45024492
// Add cost for expanding or narrowing
45034493
NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
45044494
DstVecTy, SrcVecTy, ExtToVecMask, CostKind);

llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -91,21 +91,8 @@ entry:
9191
define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) {
9292
; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening(
9393
; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] {
94-
; OPT-NEXT: [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17
95-
; OPT-NEXT: [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18
96-
; OPT-NEXT: [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19
97-
; OPT-NEXT: [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21
98-
; OPT-NEXT: [[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22
99-
; OPT-NEXT: [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23
100-
; OPT-NEXT: [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
101-
; OPT-NEXT: [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1
102-
; OPT-NEXT: [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2
103-
; OPT-NEXT: [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3
104-
; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 20, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
105-
; OPT-NEXT: [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7>
106-
; OPT-NEXT: [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5
107-
; OPT-NEXT: [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6
108-
; OPT-NEXT: [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7
94+
; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
95+
; OPT-NEXT: [[O_7:%.*]] = shufflevector <8 x i8> poison, <8 x i8> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
10996
; OPT-NEXT: ret <8 x i8> [[O_7]]
11097
;
11198
%i.0 = extractelement <32 x i8> %in, i64 16

llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,14 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
140140
}
141141

142142
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
143-
; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
144-
; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
145-
; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
146-
; CHECK-NEXT: ret <2 x double> [[INS]]
143+
; SSE-LABEL: @src_ins0_v2f64_ext3_v4f64(
144+
; SSE-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
145+
; SSE-NEXT: ret <2 x double> [[INS]]
146+
;
147+
; AVX-LABEL: @src_ins0_v2f64_ext3_v4f64(
148+
; AVX-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
149+
; AVX-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
150+
; AVX-NEXT: ret <2 x double> [[INS]]
147151
;
148152
%ext = extractelement <4 x double> %b, i32 3
149153
%ins = insertelement <2 x double> poison, double %ext, i32 0

llvm/test/Transforms/VectorCombine/X86/extract-insert.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
136136

137137
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
138138
; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
139-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
140-
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 2, i32 1>
139+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
140+
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
141141
; CHECK-NEXT: ret <2 x double> [[INS]]
142142
;
143143
%ext = extractelement <4 x double> %b, i32 3
@@ -185,8 +185,8 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
185185

186186
define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
187187
; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64(
188-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
189-
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
188+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
189+
; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
190190
; CHECK-NEXT: ret <2 x double> [[INS]]
191191
;
192192
%ext = extractelement <4 x double> %b, i32 3

llvm/test/Transforms/VectorCombine/X86/pr126085.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ define i32 @test(ptr %a0) {
66
; CHECK-SAME: ptr [[A0:%.*]]) {
77
; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[A0]], align 1
88
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
9-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i8> [[LOAD]], i64 11
10-
; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i8> [[SHUF]], i8 [[ELT]], i64 1
9+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 11>
10+
; CHECK-NEXT: [[INS:%.*]] = shufflevector <4 x i8> [[SHUF]], <4 x i8> [[TMP1]], <4 x i32> <i32 0, i32 7, i32 2, i32 3>
1111
; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i8> [[INS]] to i32
1212
; CHECK-NEXT: ret i32 [[RES]]
1313
;

0 commit comments

Comments
 (0)