Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 63 additions & 14 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10952,22 +10952,71 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl,
SDValue &Hi) const {
EVT VT = LHS.getValueType();
assert(RHS.getValueType() == VT && "Mismatching operand types");
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
// We can fall back to a libcall with an illegal type for the MUL if we
// have a libcall big enough.
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
if (WideVT == MVT::i16)
LC = RTLIB::MUL_I16;
else if (WideVT == MVT::i32)
LC = RTLIB::MUL_I32;
else if (WideVT == MVT::i64)
LC = RTLIB::MUL_I64;
else if (WideVT == MVT::i128)
LC = RTLIB::MUL_I128;

SDValue HiLHS;
SDValue HiRHS;
if (Signed) {
// The high part is obtained by SRA'ing all but one of the bits of low
// part.
unsigned LoSize = VT.getFixedSizeInBits();
SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl);
HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift);
HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift);
} else {
HiLHS = DAG.getConstant(0, dl, VT);
HiRHS = DAG.getConstant(0, dl, VT);
if (LC != RTLIB::UNKNOWN_LIBCALL && getLibcallName(LC)) {
SDValue HiLHS, HiRHS;
if (Signed) {
// The high part is obtained by SRA'ing all but one of the bits of low
// part.
unsigned LoSize = VT.getFixedSizeInBits();
SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl);
HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift);
HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift);
} else {
HiLHS = DAG.getConstant(0, dl, VT);
HiRHS = DAG.getConstant(0, dl, VT);
}
forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
return;
}
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);

// Expand the multiplication by brute force. This is a generalized-version of
// the code from Hacker's Delight (itself derived from Knuth's Algorithm M
// from section 4.3.1) combined with the Hacker's Delight code
// for calculating mulhs.
unsigned Bits = VT.getSizeInBits();
unsigned HalfBits = Bits / 2;
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask);

SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL);
SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);

SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
// This is always an unsigned shift.
SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);

unsigned ShiftOpc = Signed ? ISD::SRA : ISD::SRL;
SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift);
SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift);

SDValue U =
DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH);
SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask);
SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift);

SDValue V =
DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL);
SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift);

Lo = DAG.getNode(ISD::ADD, dl, VT, TL,
DAG.getNode(ISD::SHL, dl, VT, V, Shift));

Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH),
DAG.getNode(ISD::ADD, dl, VT, UH, VH));
Comment on lines +10985 to +11019
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than copy the snippet in 2 places, would it be possible to move this into a function on its own that's called from the 2 forceExpandWideMULs?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code is not identical. Notably ShiftOpc is always ISD::SRL in the other code and the calculation for Hi is more complex in the other code since it needs to multiply RHLL and RLRH and add them in.

I have some other refactorings planned. Maybe I can look at sharing as part of that.

}

SDValue
Expand Down
178 changes: 77 additions & 101 deletions llvm/test/CodeGen/AArch64/i128-math.ll
Original file line number Diff line number Diff line change
Expand Up @@ -355,40 +355,32 @@ define i128 @i128_mul(i128 %x, i128 %y) {
define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_checked_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: asr x8, x1, #63
; CHECK-NEXT: asr x11, x3, #63
; CHECK-NEXT: umulh x13, x0, x2
; CHECK-NEXT: mul x9, x2, x8
; CHECK-NEXT: umulh x10, x2, x8
; CHECK-NEXT: umulh x12, x11, x0
; CHECK-NEXT: mul x14, x1, x2
; CHECK-NEXT: add x10, x10, x9
; CHECK-NEXT: madd x8, x3, x8, x10
; CHECK-NEXT: madd x10, x11, x1, x12
; CHECK-NEXT: mul x11, x11, x0
; CHECK-NEXT: umulh x12, x1, x2
; CHECK-NEXT: mul x15, x0, x3
; CHECK-NEXT: add x10, x10, x11
; CHECK-NEXT: adds x9, x11, x9
; CHECK-NEXT: umulh x16, x0, x3
; CHECK-NEXT: adc x10, x10, x8
; CHECK-NEXT: adds x8, x14, x13
; CHECK-NEXT: cinc x12, x12, hs
; CHECK-NEXT: mul x11, x1, x3
; CHECK-NEXT: adds x8, x15, x8
; CHECK-NEXT: umulh x13, x1, x3
; CHECK-NEXT: asr x9, x1, #63
; CHECK-NEXT: umulh x10, x0, x2
; CHECK-NEXT: asr x13, x3, #63
; CHECK-NEXT: mul x11, x1, x2
; CHECK-NEXT: umulh x8, x1, x2
; CHECK-NEXT: mul x9, x9, x2
; CHECK-NEXT: adds x10, x11, x10
; CHECK-NEXT: mul x14, x0, x3
; CHECK-NEXT: umulh x12, x0, x3
; CHECK-NEXT: adc x9, x8, x9
; CHECK-NEXT: mul x13, x0, x13
; CHECK-NEXT: adds x8, x14, x10
; CHECK-NEXT: mul x15, x1, x3
; CHECK-NEXT: smulh x10, x1, x3
; CHECK-NEXT: mov x1, x8
; CHECK-NEXT: cinc x14, x16, hs
; CHECK-NEXT: adds x12, x12, x14
; CHECK-NEXT: adc x11, x12, x13
; CHECK-NEXT: asr x12, x9, #63
; CHECK-NEXT: asr x13, x11, #63
; CHECK-NEXT: adds x9, x9, x11
; CHECK-NEXT: asr x11, x8, #63
; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: cset w14, hs
; CHECK-NEXT: adds x11, x11, x12
; CHECK-NEXT: asr x12, x8, #63
; CHECK-NEXT: adc x13, x13, x14
; CHECK-NEXT: adds x9, x11, x9
; CHECK-NEXT: adc x10, x13, x10
; CHECK-NEXT: cmp x9, x12
; CHECK-NEXT: ccmp x10, x12, #0, eq
; CHECK-NEXT: adc x12, x12, x13
; CHECK-NEXT: adds x9, x15, x9
; CHECK-NEXT: adc x10, x10, x12
; CHECK-NEXT: cmp x9, x11
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w2, eq
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
Expand All @@ -404,40 +396,32 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_overflowing_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: asr x8, x1, #63
; CHECK-NEXT: asr x11, x3, #63
; CHECK-NEXT: umulh x13, x0, x2
; CHECK-NEXT: mul x9, x2, x8
; CHECK-NEXT: umulh x10, x2, x8
; CHECK-NEXT: umulh x12, x11, x0
; CHECK-NEXT: mul x14, x1, x2
; CHECK-NEXT: add x10, x10, x9
; CHECK-NEXT: madd x8, x3, x8, x10
; CHECK-NEXT: madd x10, x11, x1, x12
; CHECK-NEXT: mul x11, x11, x0
; CHECK-NEXT: umulh x12, x1, x2
; CHECK-NEXT: mul x15, x0, x3
; CHECK-NEXT: add x10, x10, x11
; CHECK-NEXT: adds x9, x11, x9
; CHECK-NEXT: umulh x16, x0, x3
; CHECK-NEXT: adc x10, x10, x8
; CHECK-NEXT: adds x8, x14, x13
; CHECK-NEXT: cinc x12, x12, hs
; CHECK-NEXT: mul x11, x1, x3
; CHECK-NEXT: adds x8, x15, x8
; CHECK-NEXT: umulh x13, x1, x3
; CHECK-NEXT: asr x9, x1, #63
; CHECK-NEXT: umulh x10, x0, x2
; CHECK-NEXT: asr x13, x3, #63
; CHECK-NEXT: mul x11, x1, x2
; CHECK-NEXT: umulh x8, x1, x2
; CHECK-NEXT: mul x9, x9, x2
; CHECK-NEXT: adds x10, x11, x10
; CHECK-NEXT: mul x14, x0, x3
; CHECK-NEXT: umulh x12, x0, x3
; CHECK-NEXT: adc x9, x8, x9
; CHECK-NEXT: mul x13, x0, x13
; CHECK-NEXT: adds x8, x14, x10
; CHECK-NEXT: mul x15, x1, x3
; CHECK-NEXT: smulh x10, x1, x3
; CHECK-NEXT: mov x1, x8
; CHECK-NEXT: cinc x14, x16, hs
; CHECK-NEXT: adds x12, x12, x14
; CHECK-NEXT: adc x11, x12, x13
; CHECK-NEXT: asr x12, x9, #63
; CHECK-NEXT: asr x13, x11, #63
; CHECK-NEXT: adds x9, x9, x11
; CHECK-NEXT: asr x11, x8, #63
; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: cset w14, hs
; CHECK-NEXT: adds x11, x11, x12
; CHECK-NEXT: asr x12, x8, #63
; CHECK-NEXT: adc x13, x13, x14
; CHECK-NEXT: adds x9, x11, x9
; CHECK-NEXT: adc x10, x13, x10
; CHECK-NEXT: cmp x9, x12
; CHECK-NEXT: ccmp x10, x12, #0, eq
; CHECK-NEXT: adc x12, x12, x13
; CHECK-NEXT: adds x9, x15, x9
; CHECK-NEXT: adc x10, x10, x12
; CHECK-NEXT: cmp x9, x11
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w2, ne
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
Expand All @@ -452,46 +436,38 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
define i128 @i128_saturating_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_saturating_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: asr x8, x1, #63
; CHECK-NEXT: asr x11, x3, #63
; CHECK-NEXT: umulh x13, x0, x2
; CHECK-NEXT: mul x9, x2, x8
; CHECK-NEXT: umulh x10, x2, x8
; CHECK-NEXT: umulh x12, x11, x0
; CHECK-NEXT: mul x14, x1, x2
; CHECK-NEXT: add x10, x10, x9
; CHECK-NEXT: madd x8, x3, x8, x10
; CHECK-NEXT: madd x10, x11, x1, x12
; CHECK-NEXT: mul x11, x11, x0
; CHECK-NEXT: umulh x12, x1, x2
; CHECK-NEXT: mul x16, x0, x3
; CHECK-NEXT: add x10, x10, x11
; CHECK-NEXT: adds x9, x11, x9
; CHECK-NEXT: umulh x15, x0, x3
; CHECK-NEXT: adc x8, x10, x8
; CHECK-NEXT: adds x10, x14, x13
; CHECK-NEXT: cinc x12, x12, hs
; CHECK-NEXT: mul x17, x1, x3
; CHECK-NEXT: adds x10, x16, x10
; CHECK-NEXT: umulh x11, x1, x3
; CHECK-NEXT: cinc x13, x15, hs
; CHECK-NEXT: adds x12, x12, x13
; CHECK-NEXT: cset w13, hs
; CHECK-NEXT: adds x12, x17, x12
; CHECK-NEXT: adc x11, x11, x13
; CHECK-NEXT: adds x9, x12, x9
; CHECK-NEXT: asr x12, x10, #63
; CHECK-NEXT: asr x9, x1, #63
; CHECK-NEXT: umulh x10, x0, x2
; CHECK-NEXT: asr x13, x3, #63
; CHECK-NEXT: mul x11, x1, x2
; CHECK-NEXT: umulh x8, x1, x2
; CHECK-NEXT: mul x9, x9, x2
; CHECK-NEXT: adds x10, x11, x10
; CHECK-NEXT: mul x14, x0, x3
; CHECK-NEXT: umulh x12, x0, x3
; CHECK-NEXT: adc x8, x8, x9
; CHECK-NEXT: mul x13, x0, x13
; CHECK-NEXT: adds x9, x14, x10
; CHECK-NEXT: mul x11, x1, x3
; CHECK-NEXT: adc x10, x12, x13
; CHECK-NEXT: smulh x12, x1, x3
; CHECK-NEXT: asr x13, x8, #63
; CHECK-NEXT: asr x14, x10, #63
; CHECK-NEXT: adds x8, x8, x10
; CHECK-NEXT: adc x10, x13, x14
; CHECK-NEXT: adds x8, x11, x8
; CHECK-NEXT: asr x11, x9, #63
; CHECK-NEXT: mul x13, x0, x2
; CHECK-NEXT: adc x8, x11, x8
; CHECK-NEXT: eor x11, x3, x1
; CHECK-NEXT: eor x8, x8, x12
; CHECK-NEXT: eor x9, x9, x12
; CHECK-NEXT: asr x11, x11, #63
; CHECK-NEXT: orr x8, x9, x8
; CHECK-NEXT: eor x9, x11, #0x7fffffffffffffff
; CHECK-NEXT: adc x10, x12, x10
; CHECK-NEXT: eor x12, x3, x1
; CHECK-NEXT: eor x8, x8, x11
; CHECK-NEXT: eor x10, x10, x11
; CHECK-NEXT: asr x11, x12, #63
; CHECK-NEXT: orr x8, x8, x10
; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: csel x1, x9, x10, ne
; CHECK-NEXT: csinv x0, x13, x11, eq
; CHECK-NEXT: csel x1, x10, x9, ne
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
Expand Down
54 changes: 23 additions & 31 deletions llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,41 +35,33 @@ start:
define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
; AARCH-LABEL: __muloti4:
; AARCH: // %bb.0: // %Entry
; AARCH-NEXT: asr x10, x1, #63
; AARCH-NEXT: asr x11, x1, #63
; AARCH-NEXT: asr x9, x3, #63
; AARCH-NEXT: umulh x14, x0, x2
; AARCH-NEXT: umulh x12, x0, x2
; AARCH-NEXT: mov x8, x1
; AARCH-NEXT: str wzr, [x4]
; AARCH-NEXT: mul x12, x2, x10
; AARCH-NEXT: umulh x13, x2, x10
; AARCH-NEXT: umulh x11, x9, x0
; AARCH-NEXT: mul x15, x1, x2
; AARCH-NEXT: add x13, x13, x12
; AARCH-NEXT: madd x11, x9, x1, x11
; AARCH-NEXT: mul x9, x9, x0
; AARCH-NEXT: madd x10, x3, x10, x13
; AARCH-NEXT: umulh x13, x1, x2
; AARCH-NEXT: add x11, x11, x9
; AARCH-NEXT: adds x9, x9, x12
; AARCH-NEXT: mul x16, x0, x3
; AARCH-NEXT: adc x10, x11, x10
; AARCH-NEXT: adds x11, x15, x14
; AARCH-NEXT: umulh x17, x0, x3
; AARCH-NEXT: cinc x13, x13, hs
; AARCH-NEXT: mul x12, x1, x3
; AARCH-NEXT: adds x1, x16, x11
; AARCH-NEXT: umulh x11, x8, x3
; AARCH-NEXT: cinc x14, x17, hs
; AARCH-NEXT: adds x13, x13, x14
; AARCH-NEXT: mul x13, x1, x2
; AARCH-NEXT: umulh x10, x1, x2
; AARCH-NEXT: mul x11, x11, x2
; AARCH-NEXT: adds x12, x13, x12
; AARCH-NEXT: mul x15, x0, x3
; AARCH-NEXT: umulh x14, x0, x3
; AARCH-NEXT: adc x10, x10, x11
; AARCH-NEXT: mul x9, x0, x9
; AARCH-NEXT: mul x16, x1, x3
; AARCH-NEXT: adds x1, x15, x12
; AARCH-NEXT: asr x12, x10, #63
; AARCH-NEXT: smulh x11, x8, x3
; AARCH-NEXT: adc x9, x14, x9
; AARCH-NEXT: asr x13, x9, #63
; AARCH-NEXT: adds x9, x10, x9
; AARCH-NEXT: asr x10, x1, #63
; AARCH-NEXT: mul x0, x0, x2
; AARCH-NEXT: cset w14, hs
; AARCH-NEXT: adds x12, x12, x13
; AARCH-NEXT: asr x13, x1, #63
; AARCH-NEXT: adc x11, x11, x14
; AARCH-NEXT: adds x9, x12, x9
; AARCH-NEXT: adc x10, x11, x10
; AARCH-NEXT: cmp x9, x13
; AARCH-NEXT: ccmp x10, x13, #0, eq
; AARCH-NEXT: adc x12, x12, x13
; AARCH-NEXT: adds x9, x16, x9
; AARCH-NEXT: adc x11, x11, x12
; AARCH-NEXT: cmp x9, x10
; AARCH-NEXT: ccmp x11, x10, #0, eq
; AARCH-NEXT: cset w9, ne
; AARCH-NEXT: tbz x8, #63, .LBB1_2
; AARCH-NEXT: // %bb.1: // %Entry
Expand Down
Loading
Loading