Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit e5439d5

Browse files
committed
[DAGCombiner] narrow vector binop with 2 insert subvector operands

vecbo (insertsubv undef, X, Z), (insertsubv undef, Y, Z) --> insertsubv VecC, (vecbo X, Y), Z

This is another step in generic vector narrowing. It's also a step towards
more horizontal op formation specifically for x86 (although we still failed
to match those in the affected tests).

The scalarization cases are also not optimal (we should be scalarizing
those), but it's still an improvement to use a narrower vector op when we
know part of the result must be constant because both inputs are undef in
some vector lanes.

I think a similar match but checking for a constant operand might help some
of the cases in D51553.

Differential Revision: https://reviews.llvm.org/D56875

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351825 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 5d73c0d commit e5439d5

File tree

4 files changed

+44
-28
lines changed

4 files changed

+44
-28
lines changed

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18174,6 +18174,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
1817418174
SDValue LHS = N->getOperand(0);
1817518175
SDValue RHS = N->getOperand(1);
1817618176
SDValue Ops[] = {LHS, RHS};
18177+
EVT VT = N->getValueType(0);
1817718178

1817818179
// See if we can constant fold the vector operation.
1817918180
if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -18191,7 +18192,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
1819118192
ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
1819218193

1819318194
if (SVN0->getMask().equals(SVN1->getMask())) {
18194-
EVT VT = N->getValueType(0);
1819518195
SDValue UndefVector = LHS.getOperand(1);
1819618196
SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
1819718197
LHS.getOperand(0), RHS.getOperand(0),
@@ -18202,6 +18202,29 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
1820218202
}
1820318203
}
1820418204

18205+
// The following pattern is likely to emerge with vector reduction ops. Moving
18206+
// the binary operation ahead of insertion may allow using a narrower vector
18207+
// instruction that has better performance than the wide version of the op:
18208+
// VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
18209+
if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
18210+
RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
18211+
LHS.getOperand(2) == RHS.getOperand(2) &&
18212+
(LHS.hasOneUse() || RHS.hasOneUse())) {
18213+
SDValue X = LHS.getOperand(1);
18214+
SDValue Y = RHS.getOperand(1);
18215+
SDValue Z = LHS.getOperand(2);
18216+
EVT NarrowVT = X.getValueType();
18217+
if (NarrowVT == Y.getValueType() &&
18218+
TLI.isOperationLegalOrCustomOrPromote(N->getOpcode(), NarrowVT)) {
18219+
// (binop undef, undef) may not return undef, so compute that result.
18220+
SDLoc DL(N);
18221+
SDValue VecC = DAG.getNode(N->getOpcode(), DL, VT, DAG.getUNDEF(VT),
18222+
DAG.getUNDEF(VT));
18223+
SDValue NarrowBO = DAG.getNode(N->getOpcode(), DL, NarrowVT, X, Y);
18224+
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
18225+
}
18226+
}
18227+
1820518228
return SDValue();
1820618229
}
1820718230

test/CodeGen/X86/avx512-hadd-hsub.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,14 @@ define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) {
111111
; KNL: # %bb.0:
112112
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
113113
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
114-
; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
114+
; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
115115
; KNL-NEXT: retq
116116
;
117117
; SKX-LABEL: hadd_16_3:
118118
; SKX: # %bb.0:
119119
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
120120
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
121-
; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
121+
; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
122122
; SKX-NEXT: retq
123123
%x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
124124
, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -134,14 +134,14 @@ define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) {
134134
; KNL: # %bb.0:
135135
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
136136
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
137-
; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0
137+
; KNL-NEXT: vaddps %ymm0, %ymm2, %ymm0
138138
; KNL-NEXT: retq
139139
;
140140
; SKX-LABEL: fhadd_16_3:
141141
; SKX: # %bb.0:
142142
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
143143
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
144-
; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0
144+
; SKX-NEXT: vaddps %ymm0, %ymm2, %ymm0
145145
; SKX-NEXT: retq
146146
%x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
147147
, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -156,14 +156,14 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
156156
; KNL: # %bb.0:
157157
; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
158158
; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
159-
; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
159+
; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
160160
; KNL-NEXT: retq
161161
;
162162
; SKX-LABEL: fhadd_16_4:
163163
; SKX: # %bb.0:
164164
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
165165
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
166-
; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
166+
; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
167167
; SKX-NEXT: retq
168168
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
169169
%x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 undef ,i32 undef, i32 undef, i32 undef>

test/CodeGen/X86/scalarize-fp.ll

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -198,9 +198,8 @@ define <4 x double> @fadd_op1_constant_v4f64(double %x) nounwind {
198198
;
199199
; AVX-LABEL: fadd_op1_constant_v4f64:
200200
; AVX: # %bb.0:
201-
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
202201
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
203-
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
202+
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
204203
; AVX-NEXT: retq
205204
%v = insertelement <4 x double> undef, double %x, i32 0
206205
%b = fadd <4 x double> %v, <double 42.0, double undef, double undef, double undef>
@@ -219,7 +218,7 @@ define <4 x double> @load_fadd_op1_constant_v4f64(double* %p) nounwind {
219218
; AVX: # %bb.0:
220219
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
221220
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
222-
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
221+
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
223222
; AVX-NEXT: retq
224223
%x = load double, double* %p
225224
%v = insertelement <4 x double> undef, double %x, i32 0
@@ -237,9 +236,8 @@ define <4 x double> @fsub_op0_constant_v4f64(double %x) nounwind {
237236
;
238237
; AVX-LABEL: fsub_op0_constant_v4f64:
239238
; AVX: # %bb.0:
240-
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
241239
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
242-
; AVX-NEXT: vsubpd %ymm0, %ymm1, %ymm0
240+
; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
243241
; AVX-NEXT: retq
244242
%v = insertelement <4 x double> undef, double %x, i32 0
245243
%b = fsub <4 x double> <double 42.0, double undef, double undef, double undef>, %v
@@ -258,7 +256,7 @@ define <4 x double> @load_fsub_op0_constant_v4f64(double* %p) nounwind {
258256
; AVX: # %bb.0:
259257
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
260258
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
261-
; AVX-NEXT: vsubpd %ymm0, %ymm1, %ymm0
259+
; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
262260
; AVX-NEXT: retq
263261
%x = load double, double* %p
264262
%v = insertelement <4 x double> undef, double %x, i32 0
@@ -275,9 +273,8 @@ define <4 x double> @fmul_op1_constant_v4f64(double %x) nounwind {
275273
;
276274
; AVX-LABEL: fmul_op1_constant_v4f64:
277275
; AVX: # %bb.0:
278-
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
279276
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
280-
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
277+
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
281278
; AVX-NEXT: retq
282279
%v = insertelement <4 x double> undef, double %x, i32 0
283280
%b = fmul <4 x double> %v, <double 42.0, double undef, double undef, double undef>
@@ -296,7 +293,7 @@ define <4 x double> @load_fmul_op1_constant_v4f64(double* %p) nounwind {
296293
; AVX: # %bb.0:
297294
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
298295
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
299-
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
296+
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
300297
; AVX-NEXT: retq
301298
%x = load double, double* %p
302299
%v = insertelement <4 x double> undef, double %x, i32 0
@@ -313,9 +310,8 @@ define <4 x double> @fdiv_op1_constant_v4f64(double %x) nounwind {
313310
;
314311
; AVX-LABEL: fdiv_op1_constant_v4f64:
315312
; AVX: # %bb.0:
316-
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
317313
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
318-
; AVX-NEXT: vdivpd %ymm1, %ymm0, %ymm0
314+
; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0
319315
; AVX-NEXT: retq
320316
%v = insertelement <4 x double> undef, double %x, i32 0
321317
%b = fdiv <4 x double> %v, <double 42.0, double undef, double undef, double undef>
@@ -334,7 +330,7 @@ define <4 x double> @load_fdiv_op1_constant_v4f64(double* %p) nounwind {
334330
; AVX: # %bb.0:
335331
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
336332
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
337-
; AVX-NEXT: vdivpd %ymm1, %ymm0, %ymm0
333+
; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0
338334
; AVX-NEXT: retq
339335
%x = load double, double* %p
340336
%v = insertelement <4 x double> undef, double %x, i32 0
@@ -352,9 +348,8 @@ define <4 x double> @fdiv_op0_constant_v4f64(double %x) nounwind {
352348
;
353349
; AVX-LABEL: fdiv_op0_constant_v4f64:
354350
; AVX: # %bb.0:
355-
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
356351
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
357-
; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
352+
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
358353
; AVX-NEXT: retq
359354
%v = insertelement <4 x double> undef, double %x, i32 0
360355
%b = fdiv <4 x double> <double 42.0, double undef, double undef, double undef>, %v
@@ -373,7 +368,7 @@ define <4 x double> @load_fdiv_op0_constant_v4f64(double* %p) nounwind {
373368
; AVX: # %bb.0:
374369
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
375370
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
376-
; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
371+
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
377372
; AVX-NEXT: retq
378373
%x = load double, double* %p
379374
%v = insertelement <4 x double> undef, double %x, i32 0

test/CodeGen/X86/vector-partial-undef.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@ define <4 x i64> @xor_insert_insert(<2 x i64> %x, <2 x i64> %y) {
1313
;
1414
; AVX-LABEL: xor_insert_insert:
1515
; AVX: # %bb.0:
16-
; AVX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
17-
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
18-
; AVX-NEXT: vxorps %ymm1, %ymm0, %ymm0
16+
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
1917
; AVX-NEXT: retq
2018
%xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2119
%yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -32,9 +30,9 @@ define <4 x i64> @xor_insert_insert_high_half(<2 x i64> %x, <2 x i64> %y) {
3230
;
3331
; AVX-LABEL: xor_insert_insert_high_half:
3432
; AVX: # %bb.0:
35-
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
36-
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
37-
; AVX-NEXT: vxorps %ymm1, %ymm0, %ymm0
33+
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
34+
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
35+
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
3836
; AVX-NEXT: retq
3937
%xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
4038
%yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>

0 commit comments

Comments
 (0)