Skip to content

Commit db9d4ea

Browse files
committed
[DAG] Use known-bits when creating umulh/smulh.
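This extends the creation of umulh/smulh instructions to handle cases where only one multiply operand is a zext/sext and the other is not an extend, but is known to have enough leading zero bits (when paired with a zext) or sign bits (when paired with a sext) to be truncated to the narrow type, so a mulh can still be created. This can be useful when one of the operands is hoisted out of a loop.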
1 parent ab1fd21 commit db9d4ea

File tree

5 files changed

+68
-136
lines changed

5 files changed

+68
-136
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1078910789
SDValue LeftOp = ShiftOperand.getOperand(0);
1079010790
SDValue RightOp = ShiftOperand.getOperand(1);
1079110791

10792+
if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
10793+
LeftOp.getOpcode() != ISD::ZERO_EXTEND)
10794+
std::swap(LeftOp, RightOp);
10795+
1079210796
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
1079310797
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
1079410798

@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1082110825
}
1082210826

1082310827
SDValue MulhRightOp;
10824-
if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
10825-
unsigned ActiveBits = IsSignExt
10826-
? Constant->getAPIntValue().getSignificantBits()
10827-
: Constant->getAPIntValue().getActiveBits();
10828-
if (ActiveBits > NarrowVTSize)
10828+
if (LeftOp.getOpcode() != RightOp.getOpcode()) {
10829+
if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
10830+
unsigned ActiveBits = IsSignExt
10831+
? Constant->getAPIntValue().getSignificantBits()
10832+
: Constant->getAPIntValue().getActiveBits();
10833+
if (ActiveBits > NarrowVTSize)
10834+
return SDValue();
10835+
MulhRightOp = DAG.getConstant(
10836+
Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
10837+
NarrowVT);
10838+
} else if (IsZeroExt &&
10839+
DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
10840+
NarrowVTSize) {
10841+
MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
10842+
} else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
10843+
MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
10844+
} else {
1082910845
return SDValue();
10830-
MulhRightOp = DAG.getConstant(
10831-
Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
10832-
NarrowVT);
10846+
}
1083310847
} else {
10834-
if (LeftOp.getOpcode() != RightOp.getOpcode())
10835-
return SDValue();
1083610848
// Check that the two extend nodes are the same type.
1083710849
if (NarrowVT != RightOp.getOperand(0).getValueType())
1083810850
return SDValue();

llvm/test/CodeGen/AMDGPU/sdiv64.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
571571
; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
572572
; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
573573
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
574-
; GCN-NEXT: v_mul_hi_u32 v2, v1, v2
574+
; GCN-NEXT: v_mul_hi_u32 v2, v2, v1
575575
; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0
576576
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2
577577
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
@@ -598,7 +598,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
598598
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2
599599
; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3
600600
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3
601-
; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2
601+
; GCN-IR-NEXT: v_mul_hi_u32 v2, v2, v1
602602
; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0
603603
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
604604
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3

llvm/test/CodeGen/AMDGPU/udiv64.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
515515
; GCN-NEXT: s_mov_b32 s4, s0
516516
; GCN-NEXT: s_mov_b32 s5, s1
517517
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
518-
; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
518+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s3
519519
; GCN-NEXT: v_readfirstlane_b32 s0, v0
520520
; GCN-NEXT: s_mul_i32 s0, s0, s8
521521
; GCN-NEXT: s_sub_i32 s0, s3, s0
@@ -551,7 +551,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
551551
; GCN-IR-NEXT: s_mov_b32 s4, s0
552552
; GCN-IR-NEXT: s_mov_b32 s5, s1
553553
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
554-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0
554+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s3
555555
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
556556
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
557557
; GCN-IR-NEXT: s_sub_i32 s0, s3, s0
@@ -595,7 +595,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
595595
; GCN-NEXT: s_lshr_b32 s2, s3, 1
596596
; GCN-NEXT: s_mov_b32 s4, s0
597597
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
598-
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
598+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
599599
; GCN-NEXT: s_mov_b32 s5, s1
600600
; GCN-NEXT: v_readfirstlane_b32 s0, v0
601601
; GCN-NEXT: s_mul_i32 s0, s0, s8
@@ -633,7 +633,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
633633
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
634634
; GCN-IR-NEXT: s_mov_b32 s4, s0
635635
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
636-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
636+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
637637
; GCN-IR-NEXT: s_mov_b32 s5, s1
638638
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
639639
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8

llvm/test/CodeGen/AMDGPU/urem64.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
467467
; GCN-NEXT: s_lshr_b32 s2, s3, 1
468468
; GCN-NEXT: s_mov_b32 s4, s0
469469
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
470-
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
470+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
471471
; GCN-NEXT: s_mov_b32 s5, s1
472472
; GCN-NEXT: v_mov_b32_e32 v1, 0
473473
; GCN-NEXT: v_readfirstlane_b32 s0, v0
@@ -502,7 +502,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
502502
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
503503
; GCN-IR-NEXT: s_mov_b32 s4, s0
504504
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
505-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
505+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
506506
; GCN-IR-NEXT: s_mov_b32 s5, s1
507507
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
508508
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
@@ -544,7 +544,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
544544
; GCN-NEXT: s_lshr_b32 s1, s9, 1
545545
; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
546546
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
547-
; GCN-NEXT: v_mul_hi_u32 v0, s1, v0
547+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
548548
; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
549549
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
550550
; GCN-NEXT: v_readfirstlane_b32 s2, v0
@@ -562,7 +562,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
562562
; GCN-NEXT: s_mov_b32 s2, -1
563563
; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
564564
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
565-
; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
565+
; GCN-NEXT: v_mul_hi_u32 v2, v0, s7
566566
; GCN-NEXT: v_mov_b32_e32 v1, 0
567567
; GCN-NEXT: v_mov_b32_e32 v0, s8
568568
; GCN-NEXT: v_mov_b32_e32 v3, v1
@@ -599,7 +599,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
599599
; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1
600600
; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1
601601
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
602-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0
602+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s1
603603
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
604604
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1
605605
; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0
@@ -617,7 +617,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
617617
; GCN-IR-NEXT: s_mov_b32 s2, -1
618618
; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0
619619
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0
620-
; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0
620+
; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, s7
621621
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
622622
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
623623
; GCN-IR-NEXT: v_mov_b32_e32 v3, v1
@@ -728,7 +728,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
728728
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
729729
; GCN-NEXT: v_mul_lo_u32 v1, v1, s4
730730
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
731-
; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
731+
; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
732732
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
733733
; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
734734
; GCN-NEXT: v_readfirstlane_b32 s4, v0
@@ -775,7 +775,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
775775
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
776776
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4
777777
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4
778-
; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0
778+
; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s7
779779
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
780780
; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
781781
; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0

0 commit comments

Comments
 (0)