
Commit 4d25c1f

RKSimon authored and github-actions[bot] committed
Automerge: [X86] Fold ADD(x,x) -> X86ISD::VSHLI(x,1) (#161843)
Now that #161007 will attempt to fold this back to ADD(x,x) in X86FixupInstTunings, we can more aggressively create X86ISD::VSHLI nodes to avoid missed optimisations due to oneuse limits, avoid unnecessary freezes, and allow AVX512 to fold to the mi memory-folding variants. I've currently limited SSE targets to cases where the ADD is the only user of x to prevent extra moves - AVX shift patterns benefit from breaking ADD+ADD+ADD chains into shifts, but it's not so beneficial on SSE with the extra moves.
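As a minimal illustration of the basic fold (a hypothetical reduced test, not part of this commit; the function name and RUN line are illustrative only, and the expected lowering follows the description above rather than verified FileCheck output):

; Hypothetical reduced example, not taken from this patch.
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2
define <8 x i16> @add_self_v8i16(<8 x i16> %x) {
  ; add x,x is now combined to X86ISD::VSHLI(x,1), so this should select a
  ; vpsllw $1 rather than a vpaddw (X86FixupInstTunings may still fold it
  ; back to an add, depending on the scheduler model).
  %r = add <8 x i16> %x, %x
  ret <8 x i16> %r
}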
2 parents 1285c6b + ab9611e commit 4d25c1f

17 files changed (+406, -398 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
@@ -58135,6 +58135,14 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
     return V;
 
+  // Prefer VSHLI to reduce uses, X86FixupInstTunings may revert this depending
+  // on the scheduler model. Limit multiple users to AVX+ targets to prevent
+  // introducing extra register moves.
+  if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL))
+    if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()))
+      return getTargetVShiftByConstNode(X86ISD::VSHLI, DL, VT.getSimpleVT(),
+                                        Op0, 1, DAG);
+
   // Canonicalize hidden LEA pattern:
   // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
   // iff c < 4
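For the Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()) guard above, a hypothetical sketch of the case it excludes on SSE-only targets (value and function names are illustrative, not taken from the patch):

; Here %x has a second user besides the add, so on SSE targets the add is
; kept: psllw $1 is destructive and would clobber %x, which is still needed,
; forcing an extra movdqa. With AVX the non-destructive vpsllw form avoids
; that, so the fold is applied regardless of other users of %x.
define <8 x i16> @add_self_extra_use(<8 x i16> %x, <8 x i16> %y) {
  %d = add <8 x i16> %x, %x
  %m = xor <8 x i16> %x, %y
  %r = or <8 x i16> %d, %m
  ret <8 x i16> %r
}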

llvm/test/CodeGen/X86/avx2-vector-shifts.ll

Lines changed: 6 additions & 6 deletions
@@ -441,21 +441,21 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
 ; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4
 ; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
 ; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4
-; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm5
+; CHECK-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
 ; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4
-; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpsllw $2, %ymm2, %ymm2
 ; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
 ; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3
 ; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; CHECK-NEXT: vpsraw $2, %ymm0, %ymm3
-; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm4
+; CHECK-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
 ; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3
-; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsllw $2, %ymm1, %ymm1
 ; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0

llvm/test/CodeGen/X86/gfni-shifts.ll

Lines changed: 84 additions & 84 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/logic-shift.ll

Lines changed: 18 additions & 18 deletions
@@ -129,21 +129,21 @@ define <16 x i8> @or_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <
 ; CHECK-NEXT: vpsraw $4, %xmm1, %xmm5
 ; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsraw $2, %xmm1, %xmm5
-; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm6
+; CHECK-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsraw $1, %xmm1, %xmm5
-; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpsllw $2, %xmm4, %xmm4
 ; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT: vpsraw $4, %xmm0, %xmm4
 ; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsraw $2, %xmm0, %xmm4
-; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm5
+; CHECK-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsraw $1, %xmm0, %xmm4
-; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpsllw $2, %xmm2, %xmm2
 ; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
 ; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -413,21 +413,21 @@ define <16 x i8> @xor_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y,
 ; CHECK-NEXT: vpsraw $4, %xmm1, %xmm5
 ; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsraw $2, %xmm1, %xmm5
-; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm6
+; CHECK-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsraw $1, %xmm1, %xmm5
-; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpsllw $2, %xmm4, %xmm4
 ; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT: vpsraw $4, %xmm0, %xmm4
 ; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsraw $2, %xmm0, %xmm4
-; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm5
+; CHECK-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsraw $1, %xmm0, %xmm4
-; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpsllw $2, %xmm2, %xmm2
 ; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
 ; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -697,21 +697,21 @@ define <16 x i8> @and_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y,
 ; CHECK-NEXT: vpsraw $4, %xmm1, %xmm5
 ; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsraw $2, %xmm1, %xmm5
-; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm6
+; CHECK-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsraw $1, %xmm1, %xmm5
-; CHECK-NEXT: vpaddw %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpsllw $2, %xmm4, %xmm4
 ; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
 ; CHECK-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT: vpsraw $4, %xmm0, %xmm4
 ; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsraw $2, %xmm0, %xmm4
-; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm5
+; CHECK-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsraw $1, %xmm0, %xmm4
-; CHECK-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpsllw $2, %xmm2, %xmm2
 ; CHECK-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
 ; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
 ; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0

llvm/test/CodeGen/X86/prefer-avx256-shift.ll

Lines changed: 18 additions & 18 deletions
@@ -302,21 +302,21 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX256-NEXT: vpsraw $4, %ymm3, %ymm4
 ; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
 ; AVX256-NEXT: vpsraw $2, %ymm3, %ymm4
-; AVX256-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX256-NEXT: vpaddw %ymm2, %ymm2, %ymm5
+; AVX256-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
 ; AVX256-NEXT: vpsraw $1, %ymm3, %ymm4
-; AVX256-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX256-NEXT: vpsllw $2, %ymm2, %ymm2
 ; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
 ; AVX256-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX256-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX256-NEXT: vpsraw $4, %ymm0, %ymm3
 ; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX256-NEXT: vpsraw $2, %ymm0, %ymm3
-; AVX256-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX256-NEXT: vpaddw %ymm1, %ymm1, %ymm4
+; AVX256-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
 ; AVX256-NEXT: vpsraw $1, %ymm0, %ymm3
-; AVX256-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX256-NEXT: vpsllw $2, %ymm1, %ymm1
 ; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX256-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
@@ -338,21 +338,21 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX512VL-NEXT: vpsraw $4, %ymm3, %ymm4
 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
 ; AVX512VL-NEXT: vpsraw $2, %ymm3, %ymm4
-; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
 ; AVX512VL-NEXT: vpsraw $1, %ymm3, %ymm4
-; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm2
 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT: vpsraw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpsraw $2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpsraw $1, %ymm0, %ymm3
-; AVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
@@ -432,21 +432,21 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX256VL-NEXT: vpsraw $4, %xmm3, %xmm4
 ; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
 ; AVX256VL-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX256VL-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX256VL-NEXT: vpaddw %xmm2, %xmm2, %xmm5
+; AVX256VL-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
 ; AVX256VL-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX256VL-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX256VL-NEXT: vpsllw $2, %xmm2, %xmm2
 ; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
 ; AVX256VL-NEXT: vpsrlw $8, %xmm2, %xmm2
 ; AVX256VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX256VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX256VL-NEXT: vpsraw $4, %xmm0, %xmm3
 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; AVX256VL-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX256VL-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX256VL-NEXT: vpaddw %xmm1, %xmm1, %xmm4
+; AVX256VL-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
 ; AVX256VL-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX256VL-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX256VL-NEXT: vpsllw $2, %xmm1, %xmm1
 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; AVX256VL-NEXT: vpsrlw $8, %xmm0, %xmm0
 ; AVX256VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
