Skip to content

Commit d20a8ae

Browse files
committed
[DAG] Use known-bits when creating umulh/smulh.
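This extends the creation of umulh/smulh nodes to handle cases where only one operand of the multiply is a zext/sext, provided the other operand has enough known leading zero bits (for zext/umulh) or sign bits (for sext/smulh) that it can be truncated to the narrow type without losing information. This can be useful when the extend of one of the operands has been hoisted out of a loop.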
1 parent f8b79e6 commit d20a8ae

File tree

5 files changed

+68
-136
lines changed

5 files changed

+68
-136
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1078910789
SDValue LeftOp = ShiftOperand.getOperand(0);
1079010790
SDValue RightOp = ShiftOperand.getOperand(1);
1079110791

10792+
if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
10793+
LeftOp.getOpcode() != ISD::ZERO_EXTEND)
10794+
std::swap(LeftOp, RightOp);
10795+
1079210796
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
1079310797
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
1079410798

@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1082110825
}
1082210826

1082310827
SDValue MulhRightOp;
10824-
if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
10825-
unsigned ActiveBits = IsSignExt
10826-
? Constant->getAPIntValue().getSignificantBits()
10827-
: Constant->getAPIntValue().getActiveBits();
10828-
if (ActiveBits > NarrowVTSize)
10828+
if (LeftOp.getOpcode() != RightOp.getOpcode()) {
10829+
if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
10830+
unsigned ActiveBits = IsSignExt
10831+
? Constant->getAPIntValue().getSignificantBits()
10832+
: Constant->getAPIntValue().getActiveBits();
10833+
if (ActiveBits > NarrowVTSize)
10834+
return SDValue();
10835+
MulhRightOp = DAG.getConstant(
10836+
Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
10837+
NarrowVT);
10838+
} else if (IsZeroExt &&
10839+
DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
10840+
NarrowVTSize) {
10841+
MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
10842+
} else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
10843+
MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
10844+
} else {
1082910845
return SDValue();
10830-
MulhRightOp = DAG.getConstant(
10831-
Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
10832-
NarrowVT);
10846+
}
1083310847
} else {
10834-
if (LeftOp.getOpcode() != RightOp.getOpcode())
10835-
return SDValue();
1083610848
// Check that the two extend nodes are the same type.
1083710849
if (NarrowVT != RightOp.getOperand(0).getValueType())
1083810850
return SDValue();

llvm/test/CodeGen/AMDGPU/sdiv64.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
572572
; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
573573
; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
574574
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
575-
; GCN-NEXT: v_mul_hi_u32 v2, v1, v2
575+
; GCN-NEXT: v_mul_hi_u32 v2, v2, v1
576576
; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0
577577
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2
578578
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
@@ -599,7 +599,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
599599
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2
600600
; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3
601601
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3
602-
; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2
602+
; GCN-IR-NEXT: v_mul_hi_u32 v2, v2, v1
603603
; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0
604604
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
605605
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3

llvm/test/CodeGen/AMDGPU/udiv64.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
512512
; GCN-NEXT: s_mov_b32 s4, s0
513513
; GCN-NEXT: s_mov_b32 s5, s1
514514
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
515-
; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
515+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s3
516516
; GCN-NEXT: v_readfirstlane_b32 s0, v0
517517
; GCN-NEXT: s_mul_i32 s0, s0, s8
518518
; GCN-NEXT: s_sub_i32 s0, s3, s0
@@ -548,7 +548,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
548548
; GCN-IR-NEXT: s_mov_b32 s4, s0
549549
; GCN-IR-NEXT: s_mov_b32 s5, s1
550550
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
551-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0
551+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s3
552552
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
553553
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
554554
; GCN-IR-NEXT: s_sub_i32 s0, s3, s0
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
592592
; GCN-NEXT: s_lshr_b32 s2, s3, 1
593593
; GCN-NEXT: s_mov_b32 s4, s0
594594
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
595-
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
595+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
596596
; GCN-NEXT: s_mov_b32 s5, s1
597597
; GCN-NEXT: v_readfirstlane_b32 s0, v0
598598
; GCN-NEXT: s_mul_i32 s0, s0, s8
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
630630
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
631631
; GCN-IR-NEXT: s_mov_b32 s4, s0
632632
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
633-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
633+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
634634
; GCN-IR-NEXT: s_mov_b32 s5, s1
635635
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
636636
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8

llvm/test/CodeGen/AMDGPU/urem64.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
469469
; GCN-NEXT: s_lshr_b32 s2, s3, 1
470470
; GCN-NEXT: s_mov_b32 s4, s0
471471
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
472-
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
472+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
473473
; GCN-NEXT: s_mov_b32 s5, s1
474474
; GCN-NEXT: v_mov_b32_e32 v1, 0
475475
; GCN-NEXT: v_readfirstlane_b32 s0, v0
@@ -504,7 +504,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
504504
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
505505
; GCN-IR-NEXT: s_mov_b32 s4, s0
506506
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
507-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
507+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
508508
; GCN-IR-NEXT: s_mov_b32 s5, s1
509509
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
510510
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
@@ -546,7 +546,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
546546
; GCN-NEXT: s_lshr_b32 s1, s9, 1
547547
; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
548548
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
549-
; GCN-NEXT: v_mul_hi_u32 v0, s1, v0
549+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
550550
; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
551551
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
552552
; GCN-NEXT: v_readfirstlane_b32 s2, v0
@@ -564,7 +564,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
564564
; GCN-NEXT: s_mov_b32 s2, -1
565565
; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
566566
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
567-
; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
567+
; GCN-NEXT: v_mul_hi_u32 v2, v0, s7
568568
; GCN-NEXT: v_mov_b32_e32 v1, 0
569569
; GCN-NEXT: v_mov_b32_e32 v0, s8
570570
; GCN-NEXT: v_mov_b32_e32 v3, v1
@@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
601601
; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1
602602
; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1
603603
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
604-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0
604+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s1
605605
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
606606
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1
607607
; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0
@@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
619619
; GCN-IR-NEXT: s_mov_b32 s2, -1
620620
; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0
621621
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0
622-
; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0
622+
; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, s7
623623
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
624624
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
625625
; GCN-IR-NEXT: v_mov_b32_e32 v3, v1
@@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
730730
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
731731
; GCN-NEXT: v_mul_lo_u32 v1, v1, s4
732732
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
733-
; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
733+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
734734
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
735735
; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
736736
; GCN-NEXT: v_readfirstlane_b32 s4, v0
@@ -777,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
777777
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
778778
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4
779779
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4
780-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0
780+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s7
781781
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
782782
; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
783783
; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0

0 commit comments

Comments
 (0)