Skip to content

Commit bc32a1d

Browse files
committed
[DAG] Add non-uniform vector support to (shl (sr[la] exact X, C1), C2) folds
1 parent e67b90b commit bc32a1d

File tree

2 files changed

+37
-60
lines changed

2 files changed

+37
-60
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8888,23 +8888,36 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
88888888
}
88898889

88908890
// fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
8891-
// fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
8892-
// TODO - support non-uniform vector shift amounts.
8893-
ConstantSDNode *N1C = isConstOrConstSplat(N1);
8894-
if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
8891+
// fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 >= C2
8892+
if ((N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
88958893
N0->getFlags().hasExact()) {
8896-
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
8897-
uint64_t C1 = N0C1->getZExtValue();
8898-
uint64_t C2 = N1C->getZExtValue();
8894+
auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
8895+
ConstantSDNode *RHS) {
8896+
const APInt &LHSC = LHS->getAPIntValue();
8897+
const APInt &RHSC = RHS->getAPIntValue();
8898+
return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
8899+
LHSC.getZExtValue() <= RHSC.getZExtValue();
8900+
};
8901+
if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
8902+
/*AllowUndefs*/ false,
8903+
/*AllowTypeMismatch*/ true)) {
88998904
SDLoc DL(N);
8900-
if (C1 <= C2)
8901-
return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
8902-
DAG.getConstant(C2 - C1, DL, ShiftVT));
8903-
return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
8904-
DAG.getConstant(C1 - C2, DL, ShiftVT));
8905+
SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
8906+
SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
8907+
return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
8908+
}
8909+
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
8910+
/*AllowUndefs*/ false,
8911+
/*AllowTypeMismatch*/ true)) {
8912+
SDLoc DL(N);
8913+
SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
8914+
SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
8915+
return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff);
89058916
}
89068917
}
89078918

8919+
ConstantSDNode *N1C = isConstOrConstSplat(N1);
8920+
89088921
// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
89098922
// (and (srl x, (sub c1, c2), MASK)
89108923
// Only fold this if the inner shift has no other uses -- if it does, folding

llvm/test/CodeGen/X86/combine-shl.ll

Lines changed: 12 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -418,39 +418,21 @@ define <4 x i32> @combine_vec_shl_ge_ashr_exact1(<4 x i32> %x) {
418418
; SSE2-LABEL: combine_vec_shl_ge_ashr_exact1:
419419
; SSE2: # %bb.0:
420420
; SSE2-NEXT: movdqa %xmm0, %xmm1
421-
; SSE2-NEXT: psrad $3, %xmm1
422-
; SSE2-NEXT: movdqa %xmm0, %xmm2
423-
; SSE2-NEXT: psrad $5, %xmm2
424-
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
425-
; SSE2-NEXT: movdqa %xmm0, %xmm1
426-
; SSE2-NEXT: psrad $8, %xmm1
427-
; SSE2-NEXT: psrad $4, %xmm0
428-
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
429-
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
430-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
431-
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
432-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
433-
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
421+
; SSE2-NEXT: pslld $2, %xmm1
422+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
423+
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
424+
; SSE2-NEXT: movaps %xmm1, %xmm0
434425
; SSE2-NEXT: retq
435426
;
436427
; SSE41-LABEL: combine_vec_shl_ge_ashr_exact1:
437428
; SSE41: # %bb.0:
438429
; SSE41-NEXT: movdqa %xmm0, %xmm1
439-
; SSE41-NEXT: psrad $8, %xmm1
440-
; SSE41-NEXT: movdqa %xmm0, %xmm2
441-
; SSE41-NEXT: psrad $4, %xmm2
442-
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
443-
; SSE41-NEXT: movdqa %xmm0, %xmm1
444-
; SSE41-NEXT: psrad $5, %xmm1
445-
; SSE41-NEXT: psrad $3, %xmm0
446-
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
447-
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
448-
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
430+
; SSE41-NEXT: pslld $2, %xmm1
431+
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
449432
; SSE41-NEXT: retq
450433
;
451434
; AVX-LABEL: combine_vec_shl_ge_ashr_exact1:
452435
; AVX: # %bb.0:
453-
; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
454436
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
455437
; AVX-NEXT: retq
456438
%1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
@@ -495,40 +477,22 @@ define <4 x i32> @combine_vec_shl_lt_ashr_exact1(<4 x i32> %x) {
495477
; SSE2-LABEL: combine_vec_shl_lt_ashr_exact1:
496478
; SSE2: # %bb.0:
497479
; SSE2-NEXT: movdqa %xmm0, %xmm1
498-
; SSE2-NEXT: psrad $5, %xmm1
499-
; SSE2-NEXT: movdqa %xmm0, %xmm2
500-
; SSE2-NEXT: psrad $7, %xmm2
501-
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
502-
; SSE2-NEXT: movdqa %xmm0, %xmm1
503-
; SSE2-NEXT: psrad $8, %xmm1
504-
; SSE2-NEXT: psrad $6, %xmm0
505-
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
506-
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
507-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
508-
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
509-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
510-
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
480+
; SSE2-NEXT: psrad $2, %xmm1
481+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
482+
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
483+
; SSE2-NEXT: movaps %xmm1, %xmm0
511484
; SSE2-NEXT: retq
512485
;
513486
; SSE41-LABEL: combine_vec_shl_lt_ashr_exact1:
514487
; SSE41: # %bb.0:
515488
; SSE41-NEXT: movdqa %xmm0, %xmm1
516-
; SSE41-NEXT: psrad $8, %xmm1
517-
; SSE41-NEXT: movdqa %xmm0, %xmm2
518-
; SSE41-NEXT: psrad $6, %xmm2
519-
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
520-
; SSE41-NEXT: movdqa %xmm0, %xmm1
521-
; SSE41-NEXT: psrad $7, %xmm1
522-
; SSE41-NEXT: psrad $5, %xmm0
523-
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
524-
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
525-
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
489+
; SSE41-NEXT: psrad $2, %xmm1
490+
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
526491
; SSE41-NEXT: retq
527492
;
528493
; AVX-LABEL: combine_vec_shl_lt_ashr_exact1:
529494
; AVX: # %bb.0:
530495
; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
531-
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
532496
; AVX-NEXT: retq
533497
%1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
534498
%2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>

0 commit comments

Comments
 (0)