diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 208d724f7ae28..184f96b872aa6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1046,7 +1046,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::StoreV4) MAKE_CASE(NVPTXISD::FSHL_CLAMP) MAKE_CASE(NVPTXISD::FSHR_CLAMP) - MAKE_CASE(NVPTXISD::IMAD) MAKE_CASE(NVPTXISD::BFE) MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) @@ -4451,14 +4450,8 @@ PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, if (!N0.getNode()->hasOneUse()) return SDValue(); - // fold (add (mul a, b), c) -> (mad a, b, c) - // - if (N0.getOpcode() == ISD::MUL) - return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), N1); - // fold (add (select cond, 0, (mul a, b)), c) - // -> (select cond, c, (mad a, b, c)) + // -> (select cond, c, (add (mul a, b), c)) // if (N0.getOpcode() == ISD::SELECT) { unsigned ZeroOpNum; @@ -4473,8 +4466,10 @@ PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse()) return SDValue(); - SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, - M->getOperand(0), M->getOperand(1), N1); + SDLoc DL(N); + SDValue Mul = + DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1)); + SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1); return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0), ((ZeroOpNum == 1) ? N1 : MAD), ((ZeroOpNum == 1) ? 
MAD : N1)); @@ -4911,8 +4906,10 @@ static SDValue matchMADConstOnePattern(SDValue Add) { static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI) { - if (SDValue Y = matchMADConstOnePattern(Add)) - return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X); + if (SDValue Y = matchMADConstOnePattern(Add)) { + SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y); + return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X); + } return SDValue(); } @@ -4959,7 +4956,7 @@ PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, SDLoc DL(N); - // (mul x, (add y, 1)) -> (mad x, y, x) + // (mul x, (add y, 1)) -> (add (mul x, y), x) if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI)) return Res; if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI)) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 4a98fe21b81dc..51265ed2179d8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -55,7 +55,6 @@ enum NodeType : unsigned { FSHR_CLAMP, MUL_WIDE_SIGNED, MUL_WIDE_UNSIGNED, - IMAD, SETP_F16X2, SETP_BF16X2, BFE, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index f8dc66d598025..4cf36c8b5b633 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -141,6 +141,7 @@ def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; +def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; @@ -1092,73 +1093,39 @@ def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), // // Integer multiply-add // -def SDTIMAD : - SDTypeProfile<1, 3, 
[SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; -def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; - -def MAD16rrr : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, i16:$b, i16:$c))]>; -def MAD16rri : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, i16:$b, imm:$c))]>; -def MAD16rir : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, imm:$b, i16:$c))]>; -def MAD16rii : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, imm:$b, imm:$c))]>; - -def MAD32rrr : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, i32:$b, i32:$c))]>; -def MAD32rri : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, i32:$b, imm:$c))]>; -def MAD32rir : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, imm:$b, i32:$c))]>; -def MAD32rii : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, imm:$b, imm:$c))]>; - -def MAD64rrr : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, i64:$b, i64:$c))]>; -def MAD64rri : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, i64:$b, 
imm:$c))]>; -def MAD64rir : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, imm:$b, i64:$c))]>; -def MAD64rii : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, imm:$b, imm:$c))]>; +def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{ + return N->hasOneUse(); +}]>; + +multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> { + def rrr: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Reg:$b, Reg:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>; + + def rir: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Imm:$b, Reg:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>; + def rri: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Reg:$b, Imm:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>; + def rii: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Imm:$b, Imm:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>; +} + +let Predicates = [hasOptEnabled] in { +defm MAD16 : MAD<"mad.lo.s16", i16, Int16Regs, i16imm>; +defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>; +defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>; +} def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index 1b22cfde39725..304025fdb15fe 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -183,3 +183,58 @@ define i32 @test4_rev(i32 %a, i32 %b, i32 %c, i1 %p) { %add = add i32 %c, %sel ret i32 %add } + +declare i32 @use(i32 %0, i32 %1) + +define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_mad_multi_use( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_mad_multi_use_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_mad_multi_use_param_1]; +; CHECK-NEXT: mul.lo.s32 %r3, %r1, %r2; +; CHECK-NEXT: ld.param.u32 %r4, [test_mad_multi_use_param_2]; +; CHECK-NEXT: add.s32 %r5, %r3, %r4; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r3; +; CHECK-NEXT: .param .b32 param1; +; CHECK-NEXT: st.param.b32 [param1], %r5; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: use, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r6, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %res = call i32 @use(i32 %mul, i32 %add) + ret i32 %res +} + +;; This case relies on mad x 1 y => add x y; previously we emitted: +;; mad.lo.s32 %r3, %r1, 1, %r2; +define i32 @test_mad_fold(i32 %x) { +; CHECK-LABEL: test_mad_fold( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_mad_fold_param_0]; +; CHECK-NEXT: mul.hi.s32 %r2, %r1, -2147221471; +; CHECK-NEXT: add.s32 %r3, %r2, %r1; +; CHECK-NEXT: shr.u32 %r4, %r3, 31; +; CHECK-NEXT: shr.s32 %r5, %r3, 12; +; CHECK-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %div = sdiv i32 %x, 8191 + ret i32 %div +} diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 27a523b9dd91d..de19d2983f343 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -12,7 +12,7 @@ ; CHECK-NOT: __local_depot ; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; -; CHECK-32-NEXT: mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7; +; CHECK-32-NEXT: add.s32 
%r[[SIZE2:[0-9]]], %r[[SIZE]], 7; ; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8; ; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16; ; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]]; diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 7ece0ccbd844e..ca1b5fdabbf8f 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -7,20 +7,20 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; ; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<129>; +; CHECK-NEXT: .reg .b64 %rd<127>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.b64 %rd119, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd119, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd119, %rd46; +; CHECK-NEXT: mov.b64 %rd117, 0; +; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; ; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1; ; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd119, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd119, %rd50; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; ; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2; ; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2; @@ -44,7 +44,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: add.s64 %rd64, %rd63, 64; ; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; ; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd67, %rd119, 0; +; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; ; CHECK-NEXT: setp.eq.s64 %p8, %rd67, 0; ; CHECK-NEXT: setp.ne.s64 %p9, %rd67, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 
0, %p9; @@ -57,14 +57,14 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; ; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; ; CHECK-NEXT: setp.eq.s64 %p13, %rd69, 0; -; CHECK-NEXT: selp.b64 %rd128, 0, %rd4, %p12; -; CHECK-NEXT: selp.b64 %rd127, 0, %rd3, %p12; +; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p12; +; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p12; ; CHECK-NEXT: or.pred %p14, %p12, %p13; ; CHECK-NEXT: @%p14 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd121, %rd66, 1; -; CHECK-NEXT: addc.cc.s64 %rd122, %rd67, 0; -; CHECK-NEXT: or.b64 %rd72, %rd121, %rd122; +; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1; +; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0; +; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120; ; CHECK-NEXT: setp.eq.s64 %p15, %rd72, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; ; CHECK-NEXT: sub.s32 %r10, 127, %r9; @@ -75,12 +75,12 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: sub.s32 %r12, 63, %r9; ; CHECK-NEXT: shl.b64 %rd76, %rd3, %r12; ; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63; -; CHECK-NEXT: selp.b64 %rd126, %rd76, %rd75, %p16; -; CHECK-NEXT: shl.b64 %rd125, %rd3, %r10; -; CHECK-NEXT: mov.u64 %rd116, %rd119; +; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p16; +; CHECK-NEXT: shl.b64 %rd123, %rd3, %r10; +; CHECK-NEXT: mov.u64 %rd114, %rd117; ; CHECK-NEXT: @%p15 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r13, %rd121; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd119; ; CHECK-NEXT: shr.u64 %rd79, %rd3, %r13; ; CHECK-NEXT: sub.s32 %r14, 64, %r13; ; CHECK-NEXT: shl.b64 %rd80, %rd4, %r14; @@ -88,61 +88,59 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: add.s32 %r15, %r13, -64; ; CHECK-NEXT: shr.u64 %rd82, %rd4, %r15; ; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63; -; CHECK-NEXT: selp.b64 %rd123, %rd82, %rd81, %p17; -; CHECK-NEXT: shr.u64 %rd124, %rd4, %r13; +; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p17; +; CHECK-NEXT: shr.u64 %rd122, %rd4, %r13; 
; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.b64 %rd116, 0; -; CHECK-NEXT: mov.u64 %rd119, %rd116; +; CHECK-NEXT: mov.b64 %rd114, 0; +; CHECK-NEXT: mov.u64 %rd117, %rd114; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd83, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd84, %rd124, 1; +; CHECK-NEXT: shr.u64 %rd83, %rd121, 63; +; CHECK-NEXT: shl.b64 %rd84, %rd122, 1; ; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; -; CHECK-NEXT: shl.b64 %rd86, %rd123, 1; -; CHECK-NEXT: shr.u64 %rd87, %rd126, 63; +; CHECK-NEXT: shl.b64 %rd86, %rd121, 1; +; CHECK-NEXT: shr.u64 %rd87, %rd124, 63; ; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; -; CHECK-NEXT: shr.u64 %rd89, %rd125, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd126, 1; +; CHECK-NEXT: shr.u64 %rd89, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd124, 1; ; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd125, 1; -; CHECK-NEXT: or.b64 %rd125, %rd119, %rd92; -; CHECK-NEXT: or.b64 %rd126, %rd116, %rd91; +; CHECK-NEXT: shl.b64 %rd92, %rd123, 1; +; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92; +; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91; ; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; ; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; ; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; -; CHECK-NEXT: and.b64 %rd119, %rd95, 1; +; CHECK-NEXT: and.b64 %rd117, %rd95, 1; ; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; ; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd123, %rd88, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd124, %rd85, %rd97; -; CHECK-NEXT: add.cc.s64 %rd121, %rd121, -1; -; CHECK-NEXT: addc.cc.s64 %rd122, %rd122, -1; -; CHECK-NEXT: or.b64 %rd98, %rd121, %rd122; +; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97; +; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; +; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; +; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120; ; CHECK-NEXT: 
setp.eq.s64 %p18, %rd98, 0; ; CHECK-NEXT: @%p18 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd99, %rd125, 63; -; CHECK-NEXT: shl.b64 %rd100, %rd126, 1; +; CHECK-NEXT: shr.u64 %rd99, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd100, %rd124, 1; ; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; -; CHECK-NEXT: shl.b64 %rd102, %rd125, 1; -; CHECK-NEXT: or.b64 %rd127, %rd119, %rd102; -; CHECK-NEXT: or.b64 %rd128, %rd116, %rd101; +; CHECK-NEXT: shl.b64 %rd102, %rd123, 1; +; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102; +; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd127; -; CHECK-NEXT: mul.lo.s64 %rd104, %rd5, %rd128; -; CHECK-NEXT: add.s64 %rd105, %rd103, %rd104; -; CHECK-NEXT: mul.lo.s64 %rd106, %rd6, %rd127; -; CHECK-NEXT: add.s64 %rd107, %rd105, %rd106; -; CHECK-NEXT: mul.lo.s64 %rd108, %rd5, %rd127; -; CHECK-NEXT: sub.cc.s64 %rd109, %rd3, %rd108; -; CHECK-NEXT: subc.cc.s64 %rd110, %rd4, %rd107; -; CHECK-NEXT: xor.b64 %rd111, %rd109, %rd2; -; CHECK-NEXT: xor.b64 %rd112, %rd110, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd113, %rd111, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd114, %rd112, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd113, %rd114}; +; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125; +; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103; +; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104; +; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; +; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; +; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; +; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; +; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; +; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; +; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -153,7 +151,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; 
CHECK-NEXT: .reg .pred %p<17>; ; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<115>; +; CHECK-NEXT: .reg .b64 %rd<113>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0]; @@ -177,9 +175,9 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd105, 0; +; CHECK-NEXT: mov.b64 %rd103, 0; ; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd105, 0; +; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0; ; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0; ; CHECK-NEXT: setp.ne.s64 %p7, %rd57, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p7; @@ -192,14 +190,14 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; ; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; ; CHECK-NEXT: setp.eq.s64 %p11, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd114, 0, %rd42, %p10; -; CHECK-NEXT: selp.b64 %rd113, 0, %rd41, %p10; +; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p10; +; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p10; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: @%p12 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd107, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd108, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd107, %rd108; +; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1; +; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0; +; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106; ; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; ; CHECK-NEXT: sub.s32 %r10, 127, %r9; @@ -210,12 +208,12 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: sub.s32 %r12, 63, %r9; ; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12; ; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63; -; CHECK-NEXT: selp.b64 %rd112, %rd66, %rd65, %p14; -; CHECK-NEXT: shl.b64 %rd111, %rd41, %r10; -; CHECK-NEXT: mov.u64 %rd102, %rd105; +; 
CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p14; +; CHECK-NEXT: shl.b64 %rd109, %rd41, %r10; +; CHECK-NEXT: mov.u64 %rd100, %rd103; ; CHECK-NEXT: @%p13 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r13, %rd107; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd105; ; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13; ; CHECK-NEXT: sub.s32 %r14, 64, %r13; ; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14; @@ -223,57 +221,55 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: add.s32 %r15, %r13, -64; ; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15; ; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63; -; CHECK-NEXT: selp.b64 %rd109, %rd72, %rd71, %p15; -; CHECK-NEXT: shr.u64 %rd110, %rd42, %r13; +; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p15; +; CHECK-NEXT: shr.u64 %rd108, %rd42, %r13; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd102, 0; -; CHECK-NEXT: mov.u64 %rd105, %rd102; +; CHECK-NEXT: mov.b64 %rd100, 0; +; CHECK-NEXT: mov.u64 %rd103, %rd100; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd110, 1; +; CHECK-NEXT: shr.u64 %rd73, %rd107, 63; +; CHECK-NEXT: shl.b64 %rd74, %rd108, 1; ; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd109, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd112, 63; +; CHECK-NEXT: shl.b64 %rd76, %rd107, 1; +; CHECK-NEXT: shr.u64 %rd77, %rd110, 63; ; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd111, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd112, 1; +; CHECK-NEXT: shr.u64 %rd79, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd80, %rd110, 1; ; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 %rd82, %rd111, 1; -; CHECK-NEXT: or.b64 %rd111, %rd105, %rd82; -; CHECK-NEXT: or.b64 %rd112, %rd102, %rd81; +; CHECK-NEXT: shl.b64 %rd82, %rd109, 1; +; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82; +; CHECK-NEXT: or.b64 %rd110, 
%rd100, %rd81; ; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; ; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; ; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd105, %rd85, 1; +; CHECK-NEXT: and.b64 %rd103, %rd85, 1; ; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; ; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd109, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd110, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd107, %rd107, -1; -; CHECK-NEXT: addc.cc.s64 %rd108, %rd108, -1; -; CHECK-NEXT: or.b64 %rd88, %rd107, %rd108; +; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86; +; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87; +; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; +; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; +; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106; ; CHECK-NEXT: setp.eq.s64 %p16, %rd88, 0; ; CHECK-NEXT: @%p16 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd111, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd112, 1; +; CHECK-NEXT: shr.u64 %rd89, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd110, 1; ; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd111, 1; -; CHECK-NEXT: or.b64 %rd113, %rd105, %rd92; -; CHECK-NEXT: or.b64 %rd114, %rd102, %rd91; +; CHECK-NEXT: shl.b64 %rd92, %rd109, 1; +; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92; +; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd113; -; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd114; -; CHECK-NEXT: add.s64 %rd95, %rd93, %rd94; -; CHECK-NEXT: mul.lo.s64 %rd96, %rd4, %rd113; -; CHECK-NEXT: add.s64 %rd97, %rd95, %rd96; -; CHECK-NEXT: mul.lo.s64 %rd98, %rd3, %rd113; -; CHECK-NEXT: sub.cc.s64 %rd99, %rd41, %rd98; -; CHECK-NEXT: subc.cc.s64 %rd100, %rd42, %rd97; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd99, %rd100}; +; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111; +; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93; +; CHECK-NEXT: 
mad.lo.s64 %rd95, %rd4, %rd111, %rd94; +; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; +; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div