From 0ac2841d82ebd89262761d841655e45333314edd Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Wed, 18 Dec 2024 18:46:43 +0000 Subject: [PATCH 1/2] [NVPTX] Avoid introducing unnecessary ProxyRegs and Movs in ISel --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 39 +-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 4 +- llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 26 +- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 48 ++-- llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 136 ++++----- .../test/CodeGen/NVPTX/bf16x2-instructions.ll | 40 +-- llvm/test/CodeGen/NVPTX/chain-different-as.ll | 4 +- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 2 +- .../CodeGen/NVPTX/compute-ptx-value-vts.ll | 4 +- llvm/test/CodeGen/NVPTX/demote-vars.ll | 2 +- llvm/test/CodeGen/NVPTX/extractelement.ll | 42 +-- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 138 +++++----- llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 168 +++++------ .../CodeGen/NVPTX/fma-relu-fma-intrinsic.ll | 106 +++---- .../NVPTX/fma-relu-instruction-flag.ll | 244 ++++++++-------- llvm/test/CodeGen/NVPTX/i1-load-lower.ll | 2 +- llvm/test/CodeGen/NVPTX/i128.ll | 18 +- llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 72 ++--- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 172 ++++++------ .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 6 +- .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 8 +- .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 2 +- llvm/test/CodeGen/NVPTX/math-intrins.ll | 176 ++++++------ llvm/test/CodeGen/NVPTX/misched_func_call.ll | 2 +- llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll | 4 +- llvm/test/CodeGen/NVPTX/reg-types.ll | 12 +- ...unfold-masked-merge-vector-variablemask.ll | 260 +++++++++--------- llvm/test/CodeGen/NVPTX/vaargs.ll | 2 +- llvm/test/CodeGen/NVPTX/variadics-backend.ll | 12 +- llvm/test/CodeGen/NVPTX/vector-returns.ll | 50 ++-- 30 files changed, 895 insertions(+), 906 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index abaf8e0b0ec1f..eb4918c43f0dc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1994,22 +1994,15 @@ let IsSimpleMove=1, hasSideEffects=0 in { def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), "mov.pred \t$dst, $src;", [(set i1:$dst, imm:$src)]>; -def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.u16 \t$dst, $src;", - [(set i16:$dst, imm:$src)]>; -def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.u32 \t$dst, $src;", - [(set i32:$dst, imm:$src)]>; -def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.u64 \t$dst, $src;", - [(set i64:$dst, imm:$src)]>; - def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.b16 \t$dst, $src;", []>; + "mov.b16 \t$dst, $src;", + [(set i16:$dst, imm:$src)]>; def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.b32 \t$dst, $src;", []>; + "mov.b32 \t$dst, $src;", + [(set i32:$dst, imm:$src)]>; def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.b64 \t$dst, $src;", []>; + "mov.b64 \t$dst, $src;", + [(set i64:$dst, imm:$src)]>; def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), "mov.f32 \t$dst, $src;", @@ -2018,8 +2011,8 @@ def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), "mov.f64 \t$dst, $src;", [(set f64:$dst, fpimm:$src)]>; -def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; -def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; +def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOVB32ri texternalsym:$dst)>; +def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOVB64ri texternalsym:$dst)>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), @@ -3104,21 +3097,17 @@ def: Pat<(f32 (bitconvert vt:$a)), (BITCONVERT_32_I2F Int32Regs:$a)>; } foreach vt = [f16, bf16] in { -def: Pat<(vt (bitconvert (i16 UInt16Const:$a))), - (IMOVB16ri UInt16Const:$a)>; -def: Pat<(vt (bitconvert i16:$a)), - (ProxyRegI16 Int16Regs:$a)>; -def: Pat<(i16 (bitconvert vt:$a)), - (ProxyRegI16 Int16Regs:$a)>; + def: Pat<(vt (bitconvert i16:$a)), + (vt Int16Regs:$a)>; + def: Pat<(i16 (bitconvert vt:$a)), + (i16 Int16Regs:$a)>; } foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in { - def: Pat<(ta (bitconvert (i32 UInt32Const:$a))), - (IMOVB32ri UInt32Const:$a)>; foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in { if !ne(ta, tb) then { - def: Pat<(ta (bitconvert (tb Int32Regs:$a))), - (ProxyRegI32 Int32Regs:$a)>; + def: Pat<(ta (bitconvert tb:$a)), + (ta Int32Regs:$a)>; } } } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 256161d5d79c7..6d4a56f191825 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2803,10 +2803,10 @@ def : Pat<(int_nvvm_ptr_param_to_gen i64:$src), // nvvm.ptr.gen.to.param def : Pat<(int_nvvm_ptr_gen_to_param i32:$src), - (IMOV32rr Int32Regs:$src)>; + (i32 Int32Regs:$src)>; def : Pat<(int_nvvm_ptr_gen_to_param i64:$src), - (IMOV64rr Int64Regs:$src)>; + (i64 Int64Regs:$src)>; // nvvm.move intrinsicc def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index bae7109288b99..05f466f2138ec 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -46,7 +46,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-LABEL: test( ; CHECKPTX62: { ; CHECKPTX62-NEXT: .reg .pred %p<5>; -; CHECKPTX62-NEXT: .reg .b16 %rs<19>; +; CHECKPTX62-NEXT: .reg .b16 %rs<11>; ; CHECKPTX62-NEXT: .reg .b32 %r<58>; ; CHECKPTX62-EMPTY: ; CHECKPTX62-NEXT: // %bb.0: @@ -65,8 +65,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28; -; CHECKPTX62-NEXT: add.rn.f16 %rs4, %rs2, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs4; +; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; +; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs3; ; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; @@ -79,10 +79,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs6, %r33; -; CHECKPTX62-NEXT: mov.b16 %rs8, 0x3C00; -; CHECKPTX62-NEXT: add.rn.f16 %rs9, %rs6, %rs8; -; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs9; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r33; +; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; +; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; +; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs6; ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; @@ -100,9 +100,9 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs11, %r41; -; CHECKPTX62-NEXT: add.rn.f16 %rs13, %rs11, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs13; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r41; +; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; +; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs8; ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; @@ -120,9 +120,9 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs15, %r49; -; CHECKPTX62-NEXT: add.rn.f16 %rs17, %rs15, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs17; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r49; +; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; +; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs10; ; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index 9ddb82321b4ea..f81b785f13225 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -46,7 +46,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-LABEL: test( ; CHECKPTX71: { ; CHECKPTX71-NEXT: .reg .pred %p<5>; -; CHECKPTX71-NEXT: .reg .b16 %rs<34>; +; CHECKPTX71-NEXT: .reg .b16 %rs<22>; ; CHECKPTX71-NEXT: .reg .b32 %r<4>; ; CHECKPTX71-NEXT: .reg .f32 %f<12>; ; CHECKPTX71-EMPTY: @@ -55,49 +55,49 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2]; ; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1]; ; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0]; -; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1]; +; CHECKPTX71-NEXT: ld.b16 %rs18, [%r1]; ; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30; +; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs18; ; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1; ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3; -; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14; -; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30; -; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17; +; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs18, %rs14; +; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs18; +; CHECKPTX71-NEXT: mov.u16 %rs18, %rs3; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13 -; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1]; +; CHECKPTX71-NEXT: ld.b16 %rs19, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31; +; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs19; ; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5; -; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18; -; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31; -; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs15, %f5; +; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs19, %rs15; +; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs19; +; CHECKPTX71-NEXT: mov.u16 %rs19, %rs6; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7 -; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2]; +; CHECKPTX71-NEXT: ld.global.b16 %rs20, [%r2]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32; +; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs20; ; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8; -; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22; -; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32; -; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f8; +; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs20, %rs16; +; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs20; +; CHECKPTX71-NEXT: mov.u16 %rs20, %rs9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1 -; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3]; +; CHECKPTX71-NEXT: ld.shared.b16 %rs21, [%r3]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33; +; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs21; ; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11; -; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26; -; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33; -; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs17, %f11; +; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs21, %rs17; +; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs21; +; CHECKPTX71-NEXT: mov.u16 %rs21, %rs12; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 08ed317ef9300..6828bac18cad7 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -17,7 +17,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM70-LABEL: test_fadd( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<11>; ; SM70-NEXT: .reg .f32 %f<4>; ; SM70-EMPTY: @@ -88,7 +88,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM70-LABEL: test_fsub( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<11>; ; SM70-NEXT: .reg .f32 %f<4>; ; SM70-EMPTY: @@ -159,8 +159,8 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-LABEL: test_faddx2( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<9>; -; SM70-NEXT: .reg .b32 %r<25>; +; SM70-NEXT: .reg .b16 %rs<5>; +; SM70-NEXT: .reg .b32 %r<24>; ; SM70-NEXT: .reg .f32 %f<7>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -170,8 +170,8 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r3, %rs2; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f1, %r4; -; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; SM70-NEXT: cvt.u32.u16 %r5, %rs5; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f2, %r6; ; SM70-NEXT: add.rn.f32 %f3, %f2, %f1; @@ -185,7 +185,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: shl.b32 %r14, %r13, 16; ; SM70-NEXT: mov.b32 %f4, %r14; -; SM70-NEXT: cvt.u32.u16 %r15, %rs4; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; ; SM70-NEXT: mov.b32 %f5, %r16; ; SM70-NEXT: add.rn.f32 %f6, %f5, %f4; @@ -260,8 +260,8 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-LABEL: test_fsubx2( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<9>; -; SM70-NEXT: .reg .b32 %r<25>; +; SM70-NEXT: .reg .b16 %rs<5>; +; SM70-NEXT: .reg .b32 %r<24>; ; SM70-NEXT: .reg .f32 %f<7>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -271,8 +271,8 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r3, %rs2; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f1, %r4; -; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; SM70-NEXT: cvt.u32.u16 %r5, %rs5; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f2, %r6; ; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1; @@ -286,7 +286,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: shl.b32 %r14, %r13, 16; ; SM70-NEXT: mov.b32 %f4, %r14; -; SM70-NEXT: cvt.u32.u16 %r15, %rs4; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; ; SM70-NEXT: mov.b32 %f5, %r16; ; SM70-NEXT: sub.rn.f32 %f6, %f5, %f4; @@ -361,8 +361,8 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-LABEL: test_fmulx2( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<9>; -; SM70-NEXT: .reg .b32 %r<25>; +; SM70-NEXT: .reg .b16 %rs<5>; +; SM70-NEXT: .reg .b32 %r<24>; ; SM70-NEXT: .reg .f32 %f<7>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -372,8 +372,8 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r3, %rs2; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f1, %r4; -; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; SM70-NEXT: cvt.u32.u16 %r5, %rs5; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f2, %r6; ; SM70-NEXT: mul.rn.f32 %f3, %f2, %f1; @@ -387,7 +387,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: shl.b32 %r14, %r13, 16; ; SM70-NEXT: mov.b32 %f4, %r14; -; SM70-NEXT: cvt.u32.u16 %r15, %rs4; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; ; SM70-NEXT: mov.b32 %f5, %r16; ; SM70-NEXT: mul.rn.f32 %f6, %f5, %f4; @@ -462,8 +462,8 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-LABEL: test_fdiv( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<9>; -; SM70-NEXT: .reg .b32 %r<25>; +; SM70-NEXT: .reg .b16 %rs<5>; +; SM70-NEXT: .reg .b32 %r<24>; ; SM70-NEXT: .reg .f32 %f<7>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -473,8 +473,8 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r3, %rs2; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f1, %r4; -; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; SM70-NEXT: cvt.u32.u16 %r5, %rs5; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f2, %r6; ; SM70-NEXT: div.rn.f32 %f3, %f2, %f1; @@ -488,7 +488,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: shl.b32 %r14, %r13, 16; ; SM70-NEXT: mov.b32 %f4, %r14; -; SM70-NEXT: cvt.u32.u16 %r15, %rs4; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; ; SM70-NEXT: mov.b32 %f5, %r16; ; SM70-NEXT: div.rn.f32 %f6, %f5, %f4; @@ -648,7 +648,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM70-LABEL: test_fptrunc_float( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-EMPTY: @@ -705,7 +705,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM70-LABEL: test_fadd_imm_1( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<9>; ; SM70-NEXT: .reg .f32 %f<3>; ; SM70-EMPTY: @@ -789,7 +789,7 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat % define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-LABEL: test_extload_bf16x8( ; SM70: { -; SM70-NEXT: .reg .b16 %rs<17>; +; SM70-NEXT: .reg .b16 %rs<9>; ; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .f32 %f<9>; ; SM70-NEXT: .reg .b64 %rd<2>; @@ -1033,7 +1033,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM70-LABEL: test_sitofp_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<4>; +; SM70-NEXT: .reg .b16 %rs<3>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-EMPTY: @@ -1092,7 +1092,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM70-LABEL: test_uitofp_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<4>; +; SM70-NEXT: .reg .b16 %rs<3>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-EMPTY: @@ -1151,7 +1151,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM70-LABEL: test_uitofp_i1( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<5>; +; SM70-NEXT: .reg .b16 %rs<4>; ; SM70-NEXT: .reg .b32 %r<8>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-EMPTY: @@ -1228,7 +1228,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM70-LABEL: test_uitofp_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<4>; +; SM70-NEXT: .reg .b16 %rs<3>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-EMPTY: @@ -1287,7 +1287,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM70-LABEL: test_uitofp_i32( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<8>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-EMPTY: @@ -1349,7 +1349,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM70-LABEL: test_uitofp_i64( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-NEXT: .reg .f32 %f<2>; ; SM70-NEXT: .reg .b64 %rd<2>; @@ -1412,7 +1412,7 @@ define bfloat @test_roundeven(bfloat %a) { ; SM70-LABEL: test_roundeven( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<9>; ; SM70-NEXT: .reg .f32 %f<3>; ; SM70-EMPTY: @@ -1475,33 +1475,33 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM70-LABEL: test_maximum( ; SM70: { ; SM70-NEXT: .reg .pred %p<6>; -; SM70-NEXT: .reg .b16 %rs<11>; +; SM70-NEXT: .reg .b16 %rs<8>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-NEXT: .reg .f32 %f<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; -; SM70-NEXT: ld.param.b16 %rs3, [test_maximum_param_1]; -; SM70-NEXT: cvt.u32.u16 %r1, %rs3; +; SM70-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM70-NEXT: cvt.u32.u16 %r1, %rs2; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: cvt.u32.u16 %r3, %rs1; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f2, %r4; ; SM70-NEXT: setp.gt.f32 %p1, %f2, %f1; -; SM70-NEXT: selp.b16 %rs5, %rs1, %rs3, %p1; +; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; SM70-NEXT: setp.nan.f32 %p2, %f2, %f1; -; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2; +; SM70-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2; ; SM70-NEXT: setp.eq.s16 %p3, %rs1, 0; -; SM70-NEXT: selp.b16 %rs7, %rs1, %rs6, %p3; -; SM70-NEXT: setp.eq.s16 %p4, %rs3, 0; -; SM70-NEXT: selp.b16 %rs8, %rs3, %rs7, %p4; -; SM70-NEXT: cvt.u32.u16 %r5, %rs6; +; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; +; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0; +; SM70-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f3, %r6; ; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; SM70-NEXT: selp.b16 %rs10, %rs8, %rs6, %p5; -; SM70-NEXT: st.param.b16 [func_retval0], %rs10; +; SM70-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; +; SM70-NEXT: st.param.b16 [func_retval0], %rs7; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maximum( @@ -1544,7 +1544,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM70-LABEL: test_maxnum( ; SM70: { ; SM70-NEXT: .reg .pred %p<2>; -; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b16 %rs<2>; ; SM70-NEXT: .reg .b32 %r<11>; ; SM70-NEXT: .reg .f32 %f<4>; ; SM70-EMPTY: @@ -1607,7 +1607,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-LABEL: test_maximum_v2( ; SM70: { ; SM70-NEXT: .reg .pred %p<11>; -; SM70-NEXT: .reg .b16 %rs<21>; +; SM70-NEXT: .reg .b16 %rs<15>; ; SM70-NEXT: .reg .b32 %r<16>; ; SM70-NEXT: .reg .f32 %f<7>; ; SM70-EMPTY: @@ -1618,43 +1618,43 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-NEXT: cvt.u32.u16 %r3, %rs2; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f1, %r4; -; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; SM70-NEXT: cvt.u32.u16 %r5, %rs5; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f2, %r6; ; SM70-NEXT: setp.gt.f32 %p1, %f2, %f1; -; SM70-NEXT: selp.b16 %rs7, %rs5, %rs2, %p1; +; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; ; SM70-NEXT: setp.nan.f32 %p2, %f2, %f1; -; SM70-NEXT: selp.b16 %rs8, 0x7FC0, %rs7, %p2; -; SM70-NEXT: setp.eq.s16 %p3, %rs5, 0; -; SM70-NEXT: selp.b16 %rs9, %rs5, %rs8, %p3; +; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2; +; SM70-NEXT: setp.eq.s16 %p3, %rs4, 0; +; SM70-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3; ; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0; -; SM70-NEXT: selp.b16 %rs10, %rs2, %rs9, %p4; -; SM70-NEXT: cvt.u32.u16 %r7, %rs8; +; SM70-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4; +; SM70-NEXT: cvt.u32.u16 %r7, %rs6; ; SM70-NEXT: shl.b32 %r8, %r7, 16; ; SM70-NEXT: mov.b32 %f3, %r8; ; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; SM70-NEXT: selp.b16 %rs12, %rs10, %rs8, %p5; +; SM70-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; ; SM70-NEXT: cvt.u32.u16 %r9, %rs1; ; SM70-NEXT: shl.b32 %r10, %r9, 16; ; SM70-NEXT: mov.b32 %f4, %r10; -; SM70-NEXT: cvt.u32.u16 %r11, %rs4; +; SM70-NEXT: cvt.u32.u16 %r11, %rs3; ; SM70-NEXT: shl.b32 %r12, %r11, 16; ; SM70-NEXT: mov.b32 %f5, %r12; ; SM70-NEXT: setp.gt.f32 %p6, %f5, %f4; -; SM70-NEXT: selp.b16 %rs15, %rs4, %rs1, %p6; +; SM70-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6; ; SM70-NEXT: setp.nan.f32 %p7, %f5, %f4; -; SM70-NEXT: selp.b16 %rs16, 0x7FC0, %rs15, %p7; -; SM70-NEXT: setp.eq.s16 %p8, %rs4, 0; -; SM70-NEXT: selp.b16 %rs17, %rs4, %rs16, %p8; +; SM70-NEXT: selp.b16 %rs11, 0x7FC0, %rs10, %p7; +; SM70-NEXT: setp.eq.s16 %p8, %rs3, 0; +; SM70-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8; ; SM70-NEXT: setp.eq.s16 %p9, %rs1, 0; -; SM70-NEXT: selp.b16 %rs18, %rs1, %rs17, %p9; -; SM70-NEXT: cvt.u32.u16 %r13, %rs16; +; SM70-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9; +; SM70-NEXT: cvt.u32.u16 %r13, %rs11; ; SM70-NEXT: shl.b32 %r14, %r13, 16; ; SM70-NEXT: mov.b32 %f6, %r14; ; SM70-NEXT: setp.eq.f32 %p10, %f6, 0f00000000; -; SM70-NEXT: selp.b16 %rs20, %rs18, %rs16, %p10; -; SM70-NEXT: mov.b32 %r15, {%rs20, %rs12}; +; SM70-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; +; SM70-NEXT: mov.b32 %r15, {%rs14, %rs9}; ; SM70-NEXT: st.param.b32 [func_retval0], %r15; ; SM70-NEXT: ret; ; @@ -1698,8 +1698,8 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-LABEL: test_maxnum_v2( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<9>; -; SM70-NEXT: .reg .b32 %r<25>; +; SM70-NEXT: .reg .b16 %rs<5>; +; SM70-NEXT: .reg .b32 %r<24>; ; SM70-NEXT: .reg .f32 %f<7>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -1709,8 +1709,8 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-NEXT: cvt.u32.u16 %r3, %rs2; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f1, %r4; -; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; SM70-NEXT: cvt.u32.u16 %r5, %rs5; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; ; SM70-NEXT: mov.b32 %f2, %r6; ; SM70-NEXT: max.f32 %f3, %f2, %f1; @@ -1724,7 +1724,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: shl.b32 %r14, %r13, 16; ; SM70-NEXT: mov.b32 %f4, %r14; -; SM70-NEXT: cvt.u32.u16 %r15, %rs4; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; ; SM70-NEXT: mov.b32 %f5, %r16; ; SM70-NEXT: max.f32 %f6, %f5, %f4; diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 8eb6ae8bfae86..03cdeb9683aba 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -179,7 +179,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 { ; CHECK-LABEL: test_fneg( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_fneg_param_0]; @@ -211,7 +211,7 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) { ; CHECK-LABEL: test_ldst_v3bf16( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -529,7 +529,7 @@ define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 { ; CHECK-LABEL: test_fabs( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_fabs_param_0]; @@ -805,7 +805,7 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-LABEL: test_copysign( ; SM80: { ; SM80-NEXT: .reg .pred %p<3>; -; SM80-NEXT: .reg .b16 %rs<17>; +; SM80-NEXT: .reg .b16 %rs<15>; ; SM80-NEXT: .reg .b32 %r<4>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: @@ -815,31 +815,31 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-NEXT: abs.bf16 %rs3, %rs2; ; SM80-NEXT: neg.bf16 %rs4, %rs3; ; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-NEXT: shr.u16 %rs8, %rs6, 15; -; SM80-NEXT: and.b16 %rs9, %rs8, 1; -; SM80-NEXT: setp.eq.b16 %p1, %rs9, 1; -; SM80-NEXT: selp.b16 %rs10, %rs4, %rs3, %p1; -; SM80-NEXT: abs.bf16 %rs11, %rs1; -; SM80-NEXT: neg.bf16 %rs12, %rs11; -; SM80-NEXT: shr.u16 %rs14, %rs5, 15; -; SM80-NEXT: and.b16 %rs15, %rs14, 1; -; SM80-NEXT: setp.eq.b16 %p2, %rs15, 1; -; SM80-NEXT: selp.b16 %rs16, %rs12, %rs11, %p2; -; SM80-NEXT: mov.b32 %r3, {%rs16, %rs10}; +; SM80-NEXT: shr.u16 %rs7, %rs6, 15; +; SM80-NEXT: and.b16 %rs8, %rs7, 1; +; SM80-NEXT: setp.eq.b16 %p1, %rs8, 1; +; SM80-NEXT: selp.b16 %rs9, %rs4, %rs3, %p1; +; SM80-NEXT: abs.bf16 %rs10, %rs1; +; SM80-NEXT: neg.bf16 %rs11, %rs10; +; SM80-NEXT: shr.u16 %rs12, %rs5, 15; +; SM80-NEXT: and.b16 %rs13, %rs12, 1; +; SM80-NEXT: setp.eq.b16 %p2, %rs13, 1; +; SM80-NEXT: selp.b16 %rs14, %rs11, %rs10, %p2; +; SM80-NEXT: mov.b32 %r3, {%rs14, %rs9}; ; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_copysign( ; SM90: { -; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b32 %r<6>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; ; SM90-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; SM90-NEXT: and.b32 %r4, %r2, -2147450880; -; SM90-NEXT: and.b32 %r6, %r1, 2147450879; -; SM90-NEXT: or.b32 %r7, %r6, %r4; -; SM90-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-NEXT: and.b32 %r3, %r2, -2147450880; +; SM90-NEXT: and.b32 %r4, %r1, 2147450879; +; SM90-NEXT: or.b32 %r5, %r4, %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r5; ; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r diff --git a/llvm/test/CodeGen/NVPTX/chain-different-as.ll b/llvm/test/CodeGen/NVPTX/chain-different-as.ll index 293281e17dd36..704ed234f7fe6 100644 --- a/llvm/test/CodeGen/NVPTX/chain-different-as.ll +++ b/llvm/test/CodeGen/NVPTX/chain-different-as.ll @@ -7,8 +7,8 @@ define i64 @test() nounwind readnone { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u64 %rd1, 1; -; CHECK-NEXT: mov.u64 %rd2, 42; +; CHECK-NEXT: mov.b64 %rd1, 1; +; CHECK-NEXT: mov.b64 %rd2, 42; ; CHECK-NEXT: st.u64 [%rd1], %rd2; ; CHECK-NEXT: ld.global.u64 %rd3, [%rd1]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 8508b5f5c7283..608dbb3a0ba73 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -65,7 +65,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b64 %rd3, %rd2, 1; ; SM70-NEXT: shl.b64 %rd4, %rd3, 3; ; SM70-NEXT: cvt.u32.u64 %r1, %rd4; -; SM70-NEXT: mov.u16 %rs11, 255; +; SM70-NEXT: mov.b16 %rs11, 255; ; SM70-NEXT: shl.b16 %rs12, %rs11, %r1; ; SM70-NEXT: not.b16 %rs2, %rs12; ; SM70-NEXT: shl.b16 %rs3, %rs9, %r1; diff --git a/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll b/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll index 99292d69c6a08..aa08b9605790c 100644 --- a/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll +++ b/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll @@ -36,7 +36,7 @@ define <12 x i8> @byte12() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; @@ -50,7 +50,7 @@ define <20 x i8> @byte20() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; diff --git a/llvm/test/CodeGen/NVPTX/demote-vars.ll b/llvm/test/CodeGen/NVPTX/demote-vars.ll index faa657083ba79..16ae80dca1edc 100644 --- a/llvm/test/CodeGen/NVPTX/demote-vars.ll +++ b/llvm/test/CodeGen/NVPTX/demote-vars.ll @@ -67,7 +67,7 @@ define void @define_private_global(i64 %val) { ; Also check that the if-then is still here, otherwise we may not be testing ; the "more-than-one-use" part. ; CHECK: st.shared.u64 [private_global_used_more_than_once_in_same_fct], -; CHECK: mov.u64 %[[VAR:.*]], 25 +; CHECK: mov.b64 %[[VAR:.*]], 25 ; CHECK: st.shared.u64 [private_global_used_more_than_once_in_same_fct], %[[VAR]] define void @define_private_global_more_than_one_use(i64 %val, i1 %cond) { store i64 %val, ptr addrspace(3) @private_global_used_more_than_once_in_same_fct diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index f7b410e550d03..159d100336436 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -157,29 +157,29 @@ define i16 @test_v8i8(i64 %a) { ; CHECK-LABEL: test_v8i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<16>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [test_v8i8_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; -; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; } -; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs1, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; -; CHECK-NEXT: cvt.s8.s32 %rs2, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; -; CHECK-NEXT: cvt.s8.s32 %rs3, %r7; -; CHECK-NEXT: bfe.s32 %r8, %r1, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs4, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs5, %r9; -; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; -; CHECK-NEXT: cvt.s8.s32 %rs6, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r2, 16, 8; -; CHECK-NEXT: cvt.s8.s32 %rs7, %r11; -; CHECK-NEXT: bfe.s32 %r12, %r2, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs8, %r12; +; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; } +; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; +; CHECK-NEXT: bfe.s32 %r4, %r2, 8, 8; +; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: cvt.s8.s32 %rs3, %r5; +; CHECK-NEXT: bfe.s32 %r6, %r2, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs4, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: cvt.s8.s32 %rs6, %r8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 16, 8; +; CHECK-NEXT: cvt.s8.s32 %rs7, %r9; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs8, %r10; ; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2; ; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4; ; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6; @@ -187,8 +187,8 @@ define i16 @test_v8i8(i64 %a) { ; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10; ; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12; ; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14; -; CHECK-NEXT: cvt.u32.u16 %r13, %rs15; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs15; +; CHECK-NEXT: st.param.b32 [func_retval0], %r11; ; CHECK-NEXT: ret; %v = bitcast i64 %a to <8 x i8> %r0 = extractelement <8 x i8> %v, i64 0 diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index eb0b00e883846..1905fec8ab7a8 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -408,7 +408,7 @@ define void @test_ldst_v3f16(ptr %a, ptr %b) { ; CHECK-LABEL: test_ldst_v3f16( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -1616,11 +1616,11 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { ; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xhalf_to_2xi16_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <2 x half> %a to <2 x i16> ret <2 x i16> %r @@ -1629,11 +1629,11 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xi16_to_2xhalf_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <2 x i16> %a to <2 x half> ret <2 x half> %r @@ -1657,12 +1657,12 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 { define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 { ; CHECK-LABEL: test_bitcast_2xhalf_to_float( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .f32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xhalf_to_float_param_0]; -; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xhalf_to_float_param_0]; +; CHECK-NEXT: mov.b32 %f1, %r1; ; CHECK-NEXT: st.param.f32 [func_retval0], %f1; ; CHECK-NEXT: ret; %r = bitcast <2 x half> %a to float @@ -1858,12 +1858,12 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { define <2 x half> @test_fabs(<2 x half> %a) #0 { ; CHECK-F16-LABEL: test_fabs( ; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b32 %r<5>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; -; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: and.b32 %r2, %r1, 2147450879; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_fabs( @@ -1945,34 +1945,34 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16-LABEL: test_copysign( ; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_copysign( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<11>; ; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs4, %rs2, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs6, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs9, %rs8, %rs4; -; CHECK-NOF16-NEXT: and.b16 %rs12, %rs1, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs14, %rs5, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs15, %rs14, %rs12; -; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs15, %rs9}; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs5, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs1, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs4, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs10, %rs7}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) @@ -1983,7 +1983,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16-LABEL: test_copysign_f32( ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; -; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; ; CHECK-F16-NEXT: .reg .f32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: @@ -1992,33 +1992,33 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %f2; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %f1; ; CHECK-F16-NEXT: mov.b32 %r2, {%rs2, %rs1}; -; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_copysign_f32( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-NEXT: .reg .f32 %f<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: mov.b32 %r2, %f2; ; CHECK-NOF16-NEXT: and.b32 %r3, %r2, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r3; } -; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs5, %rs3, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs6, %rs5, %rs1; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } +; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; ; CHECK-NOF16-NEXT: mov.b32 %r4, %f1; ; CHECK-NOF16-NEXT: and.b32 %r5, %r4, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } -; CHECK-NOF16-NEXT: and.b16 %rs10, %rs2, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs11, %rs10, %rs8; -; CHECK-NOF16-NEXT: mov.b32 %r6, {%rs11, %rs6}; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; } +; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7; +; CHECK-NOF16-NEXT: mov.b32 %r6, {%rs8, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x float> %b to <2 x half> @@ -2030,7 +2030,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-F16-LABEL: test_copysign_f64( ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; -; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; ; CHECK-F16-NEXT: .reg .f64 %fd<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: @@ -2039,15 +2039,15 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs1, %fd2; ; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs2, %fd1; ; CHECK-F16-NEXT: mov.b32 %r2, {%rs2, %rs1}; -; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_copysign_f64( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; ; CHECK-NOF16-NEXT: .reg .b32 %r<3>; ; CHECK-NOF16-NEXT: .reg .b64 %rd<7>; ; CHECK-NOF16-NEXT: .reg .f64 %fd<3>; @@ -2056,19 +2056,19 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-NOF16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs4, %rs2, 32767; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: mov.b64 %rd1, %fd2; ; CHECK-NOF16-NEXT: and.b64 %rd2, %rd1, -9223372036854775808; ; CHECK-NOF16-NEXT: shr.u64 %rd3, %rd2, 48; -; CHECK-NOF16-NEXT: cvt.u16.u64 %rs5, %rd3; -; CHECK-NOF16-NEXT: or.b16 %rs6, %rs4, %rs5; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs1, 32767; +; CHECK-NOF16-NEXT: cvt.u16.u64 %rs4, %rd3; +; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; ; CHECK-NOF16-NEXT: mov.b64 %rd4, %fd1; ; CHECK-NOF16-NEXT: and.b64 %rd5, %rd4, -9223372036854775808; ; CHECK-NOF16-NEXT: shr.u64 %rd6, %rd5, 48; -; CHECK-NOF16-NEXT: cvt.u16.u64 %rs10, %rd6; -; CHECK-NOF16-NEXT: or.b16 %rs11, %rs9, %rs10; -; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs11, %rs6}; +; CHECK-NOF16-NEXT: cvt.u16.u64 %rs7, %rd6; +; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7; +; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs8, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x half> @@ -2080,16 +2080,16 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16-LABEL: test_copysign_extended( ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; -; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; ; CHECK-F16-NEXT: .reg .f32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; -; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; -; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5; ; CHECK-F16-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-F16-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; @@ -2097,7 +2097,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; ; CHECK-NOF16-LABEL: test_copysign_extended( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<11>; ; CHECK-NOF16-NEXT: .reg .b32 %r<3>; ; CHECK-NOF16-NEXT: .reg .f32 %f<3>; ; CHECK-NOF16-EMPTY: @@ -2105,15 +2105,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs4, %rs1, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs5, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs9, %rs8, %rs4; -; CHECK-NOF16-NEXT: and.b16 %rs12, %rs2, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs14, %rs6, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs15, %rs14, %rs12; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs15; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs9; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs1, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs4, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs2, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs5, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs10; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs7; ; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; ; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll index fde2255a78343..48c94f275274b 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll @@ -248,7 +248,7 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat % ; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe_with_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<4>; +; CHECK-SM70-NEXT: .reg .b16 %rs<3>; ; CHECK-SM70-NEXT: .reg .b32 %r<14>; ; CHECK-SM70-NEXT: .reg .f32 %f<6>; ; CHECK-SM70-EMPTY: @@ -274,8 +274,8 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat % ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-SM70-NEXT: ret; %1 = fmul bfloat %a, %b %2 = fadd bfloat %1, %c @@ -312,7 +312,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<4>; +; CHECK-SM70-NEXT: .reg .b16 %rs<3>; ; CHECK-SM70-NEXT: .reg .b32 %r<14>; ; CHECK-SM70-NEXT: .reg .f32 %f<6>; ; CHECK-SM70-EMPTY: @@ -338,8 +338,8 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-SM70-NEXT: ret; %1 = fmul bfloat %a, %b %2 = fadd bfloat %1, %c @@ -352,7 +352,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; +; CHECK-NEXT: .reg .b16 %rs<9>; ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-NEXT: .reg .f32 %f<6>; ; CHECK-EMPTY: @@ -367,21 +367,21 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs7, %f2; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs6; ; CHECK-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-NEXT: mov.b32 %f3, %r4; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-NEXT: mov.b32 %f4, %r6; ; CHECK-NEXT: add.f32 %f5, %f3, %f4; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs11, %f5; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs11; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f5; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<12>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; ; CHECK-FTZ-NEXT: .reg .b32 %r<7>; ; CHECK-FTZ-NEXT: .reg .f32 %f<6>; ; CHECK-FTZ-EMPTY: @@ -396,22 +396,22 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; ; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f2; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %f2; ; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; ; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs8; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-FTZ-NEXT: mov.b32 %f4, %r6; ; CHECK-FTZ-NEXT: add.ftz.f32 %f5, %f3, %f4; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs11, %f5; -; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs11; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f5; +; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; ; CHECK-FTZ-NEXT: ret; ; ; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b16 %rs<4>; ; CHECK-SM70-NEXT: .reg .b32 %r<29>; ; CHECK-SM70-NEXT: .reg .f32 %f<10>; ; CHECK-SM70-EMPTY: @@ -437,7 +437,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; ; CHECK-SM70-NEXT: add.f32 %f6, %f5, 0f40E00000; ; CHECK-SM70-NEXT: mov.b32 %r14, %f6; ; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; @@ -446,7 +446,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f6, %f6; ; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs3; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536; @@ -459,8 +459,8 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f9, %f9; ; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs5}, %r28; } -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; } +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-SM70-NEXT: ret; %1 = fmul bfloat %a, %b %2 = fadd bfloat %1, %c @@ -499,7 +499,7 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) ; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<3>; +; CHECK-SM70-NEXT: .reg .b16 %rs<2>; ; CHECK-SM70-NEXT: .reg .b32 %r<20>; ; CHECK-SM70-NEXT: .reg .f32 %f<7>; ; CHECK-SM70-EMPTY: @@ -797,7 +797,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x ; CHECK-SM70-LABEL: fma_bf16x2_expanded_unsafe_with_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<19>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; ; CHECK-SM70-NEXT: .reg .b32 %r<31>; ; CHECK-SM70-NEXT: .reg .f32 %f<11>; ; CHECK-SM70-EMPTY: @@ -809,12 +809,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -825,14 +825,14 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -843,16 +843,16 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; -; CHECK-SM70-NEXT: mov.b32 %r30, {%rs18, %rs17}; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; +; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; ; CHECK-SM70-NEXT: ret; %1 = fmul <2 x bfloat> %a, %b @@ -890,7 +890,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<19>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; ; CHECK-SM70-NEXT: .reg .b32 %r<31>; ; CHECK-SM70-NEXT: .reg .f32 %f<11>; ; CHECK-SM70-EMPTY: @@ -902,12 +902,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -918,14 +918,14 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -936,16 +936,16 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; -; CHECK-SM70-NEXT: mov.b32 %r30, {%rs18, %rs17}; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; +; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; ; CHECK-SM70-NEXT: ret; %1 = fmul <2 x bfloat> %a, %b @@ -959,7 +959,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { ; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<20>; ; CHECK-NEXT: .reg .f32 %f<11>; ; CHECK-EMPTY: @@ -975,24 +975,24 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-NEXT: mov.b32 %f1, %r8; ; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-NEXT: cvt.u32.u16 %r9, %rs1; ; CHECK-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-NEXT: mov.b32 %f3, %r10; ; CHECK-NEXT: add.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r6; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs7; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r6; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs5; ; CHECK-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-NEXT: mov.b32 %f5, %r12; -; CHECK-NEXT: cvt.u32.u16 %r13, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r13, %rs4; ; CHECK-NEXT: shl.b32 %r14, %r13, 16; ; CHECK-NEXT: mov.b32 %f6, %r14; ; CHECK-NEXT: add.f32 %f7, %f5, %f6; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs6; ; CHECK-NEXT: shl.b32 %r16, %r15, 16; ; CHECK-NEXT: mov.b32 %f8, %r16; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs3; ; CHECK-NEXT: shl.b32 %r18, %r17, 16; ; CHECK-NEXT: mov.b32 %f9, %r18; ; CHECK-NEXT: add.f32 %f10, %f8, %f9; @@ -1002,7 +1002,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; ; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<13>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; ; CHECK-FTZ-NEXT: .reg .b32 %r<20>; ; CHECK-FTZ-NEXT: .reg .f32 %f<11>; ; CHECK-FTZ-EMPTY: @@ -1018,24 +1018,24 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r8; ; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1; ; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r10; ; CHECK-FTZ-NEXT: add.ftz.f32 %f4, %f3, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r6; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs7; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs5; ; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-FTZ-NEXT: mov.b32 %f5, %r12; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs6; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4; ; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16; ; CHECK-FTZ-NEXT: mov.b32 %f6, %r14; ; CHECK-FTZ-NEXT: add.ftz.f32 %f7, %f5, %f6; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs8; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs6; ; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16; ; CHECK-FTZ-NEXT: mov.b32 %f8, %r16; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs4; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3; ; CHECK-FTZ-NEXT: shl.b32 %r18, %r17, 16; ; CHECK-FTZ-NEXT: mov.b32 %f9, %r18; ; CHECK-FTZ-NEXT: add.ftz.f32 %f10, %f8, %f9; @@ -1046,8 +1046,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<9>; -; CHECK-SM70-NEXT: .reg .b16 %rs<21>; -; CHECK-SM70-NEXT: .reg .b32 %r<62>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; +; CHECK-SM70-NEXT: .reg .b32 %r<61>; ; CHECK-SM70-NEXT: .reg .f32 %f<19>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -1058,12 +1058,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs8; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1074,14 +1074,14 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs7; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -1092,15 +1092,15 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; ; CHECK-SM70-NEXT: add.f32 %f11, %f10, 0f40E00000; ; CHECK-SM70-NEXT: mov.b32 %r30, %f11; ; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1; @@ -1117,7 +1117,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f12, %f12; ; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6; -; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs18; +; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10; ; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16; ; CHECK-SM70-NEXT: mov.b32 %f13, %r43; ; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536; @@ -1130,7 +1130,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15; ; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7; -; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs17; +; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs9; ; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16; ; CHECK-SM70-NEXT: mov.b32 %f16, %r52; ; CHECK-SM70-NEXT: and.b32 %r53, %r35, -65536; @@ -1183,8 +1183,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf ; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<13>; -; CHECK-SM70-NEXT: .reg .b32 %r<44>; +; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b32 %r<43>; ; CHECK-SM70-NEXT: .reg .f32 %f<13>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -1195,12 +1195,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1214,10 +1214,10 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll index 723c72165f785..561f2b0cc0673 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll @@ -182,7 +182,7 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-LABEL: fma_bf16_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<4>; +; CHECK-SM70-NEXT: .reg .b16 %rs<3>; ; CHECK-SM70-NEXT: .reg .b32 %r<14>; ; CHECK-SM70-NEXT: .reg .f32 %f<6>; ; CHECK-SM70-EMPTY: @@ -208,8 +208,8 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-SM70-NEXT: ret; %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %2 = fcmp ogt bfloat %1, 0.0 @@ -221,7 +221,7 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .f32 %f<5>; ; CHECK-EMPTY: @@ -234,18 +234,18 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f2; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs6; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f2; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs5; ; CHECK-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-NEXT: mov.b32 %f3, %r4; ; CHECK-NEXT: add.f32 %f4, %f3, %f1; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f4; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; ; CHECK-FTZ-NEXT: .reg .b32 %r<5>; ; CHECK-FTZ-NEXT: .reg .f32 %f<5>; ; CHECK-FTZ-EMPTY: @@ -258,19 +258,19 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa ; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; ; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f2; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f2; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs5; ; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; ; CHECK-FTZ-NEXT: add.ftz.f32 %f4, %f3, %f1; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f4; -; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; +; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-FTZ-NEXT: ret; ; ; CHECK-SM70-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<4>; -; CHECK-SM70-NEXT: .reg .b16 %rs<3>; +; CHECK-SM70-NEXT: .reg .b16 %rs<2>; ; CHECK-SM70-NEXT: .reg .b32 %r<27>; ; CHECK-SM70-NEXT: .reg .f32 %f<9>; ; CHECK-SM70-EMPTY: @@ -351,7 +351,7 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-LABEL: fma_bf16_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<3>; +; CHECK-SM70-NEXT: .reg .b16 %rs<2>; ; CHECK-SM70-NEXT: .reg .b32 %r<20>; ; CHECK-SM70-NEXT: .reg .f32 %f<7>; ; CHECK-SM70-EMPTY: @@ -574,7 +574,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-LABEL: fma_bf16x2_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<19>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; ; CHECK-SM70-NEXT: .reg .b32 %r<31>; ; CHECK-SM70-NEXT: .reg .f32 %f<11>; ; CHECK-SM70-EMPTY: @@ -586,12 +586,12 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -602,14 +602,14 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -620,16 +620,16 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; -; CHECK-SM70-NEXT: mov.b32 %r30, {%rs18, %rs17}; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; +; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; ; CHECK-SM70-NEXT: ret; %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) @@ -642,7 +642,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { ; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b16 %rs<5>; ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-NEXT: .reg .f32 %f<9>; ; CHECK-EMPTY: @@ -656,17 +656,17 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-NEXT: mov.b32 %f1, %r6; ; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs1; ; CHECK-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-NEXT: mov.b32 %f3, %r8; ; CHECK-NEXT: add.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: cvt.u32.u16 %r9, %rs4; ; CHECK-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-NEXT: mov.b32 %f5, %r10; ; CHECK-NEXT: add.f32 %f6, %f5, %f3; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs3; ; CHECK-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-NEXT: mov.b32 %f7, %r12; ; CHECK-NEXT: add.f32 %f8, %f7, %f1; @@ -676,7 +676,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; ; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; ; CHECK-FTZ-NEXT: .reg .b32 %r<14>; ; CHECK-FTZ-NEXT: .reg .f32 %f<9>; ; CHECK-FTZ-EMPTY: @@ -690,17 +690,17 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r6; ; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs1; ; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r8; ; CHECK-FTZ-NEXT: add.ftz.f32 %f4, %f3, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs6; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs4; ; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-FTZ-NEXT: mov.b32 %f5, %r10; ; CHECK-FTZ-NEXT: add.ftz.f32 %f6, %f5, %f3; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs4; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs3; ; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-FTZ-NEXT: mov.b32 %f7, %r12; ; CHECK-FTZ-NEXT: add.ftz.f32 %f8, %f7, %f1; @@ -711,8 +711,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<7>; -; CHECK-SM70-NEXT: .reg .b16 %rs<13>; -; CHECK-SM70-NEXT: .reg .b32 %r<58>; +; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b32 %r<57>; ; CHECK-SM70-NEXT: .reg .f32 %f<17>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -723,12 +723,12 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs8; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -742,10 +742,10 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs7; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -835,8 +835,8 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<13>; -; CHECK-SM70-NEXT: .reg .b32 %r<44>; +; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b32 %r<43>; ; CHECK-SM70-NEXT: .reg .f32 %f<13>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -847,12 +847,12 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -866,10 +866,10 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll index ce35d9066a475..b20ca24dd91a0 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll @@ -193,7 +193,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<4>; +; CHECK-SM70-NEXT: .reg .b16 %rs<3>; ; CHECK-SM70-NEXT: .reg .b32 %r<14>; ; CHECK-SM70-NEXT: .reg .f32 %f<6>; ; CHECK-SM70-EMPTY: @@ -219,8 +219,8 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-SM70-NEXT: ret; %1 = fmul fast bfloat %a, %b %2 = fadd fast bfloat %1, %c @@ -233,7 +233,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) { define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; +; CHECK-NEXT: .reg .b16 %rs<9>; ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-NEXT: .reg .f32 %f<6>; ; CHECK-EMPTY: @@ -248,21 +248,21 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs7, %f2; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs6; ; CHECK-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-NEXT: mov.b32 %f3, %r4; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-NEXT: mov.b32 %f4, %r6; ; CHECK-NEXT: add.rn.f32 %f5, %f3, %f4; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs11, %f5; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs11; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f5; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<12>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; ; CHECK-FTZ-NEXT: .reg .b32 %r<7>; ; CHECK-FTZ-NEXT: .reg .f32 %f<6>; ; CHECK-FTZ-EMPTY: @@ -277,22 +277,22 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f2; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %f2; ; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; ; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs8; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-FTZ-NEXT: mov.b32 %f4, %r6; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f5, %f3, %f4; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs11, %f5; -; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs11; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f5; +; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; ; CHECK-FTZ-NEXT: ret; ; ; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b16 %rs<4>; ; CHECK-SM70-NEXT: .reg .b32 %r<29>; ; CHECK-SM70-NEXT: .reg .f32 %f<10>; ; CHECK-SM70-EMPTY: @@ -318,7 +318,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; ; CHECK-SM70-NEXT: add.rn.f32 %f6, %f5, 0f40E00000; ; CHECK-SM70-NEXT: mov.b32 %r14, %f6; ; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; @@ -327,7 +327,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f6, %f6; ; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs3; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536; @@ -340,8 +340,8 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f9, %f9; ; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs5}, %r28; } -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; } +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-SM70-NEXT: ret; %1 = fmul fast bfloat %a, %b %2 = fadd fast bfloat %1, %c @@ -382,7 +382,7 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) ; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<3>; +; CHECK-SM70-NEXT: .reg .b16 %rs<2>; ; CHECK-SM70-NEXT: .reg .b32 %r<20>; ; CHECK-SM70-NEXT: .reg .f32 %f<7>; ; CHECK-SM70-EMPTY: @@ -625,7 +625,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<19>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; ; CHECK-SM70-NEXT: .reg .b32 %r<31>; ; CHECK-SM70-NEXT: .reg .f32 %f<11>; ; CHECK-SM70-EMPTY: @@ -637,12 +637,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -653,14 +653,14 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -671,16 +671,16 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; -; CHECK-SM70-NEXT: mov.b32 %r30, {%rs18, %rs17}; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; +; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; ; CHECK-SM70-NEXT: ret; %1 = fmul fast <2 x bfloat> %a, %b @@ -694,7 +694,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<20>; ; CHECK-NEXT: .reg .f32 %f<11>; ; CHECK-EMPTY: @@ -710,24 +710,24 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-NEXT: mov.b32 %f1, %r8; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-NEXT: cvt.u32.u16 %r9, %rs1; ; CHECK-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-NEXT: mov.b32 %f3, %r10; ; CHECK-NEXT: add.rn.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r6; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs7; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r6; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs5; ; CHECK-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-NEXT: mov.b32 %f5, %r12; -; CHECK-NEXT: cvt.u32.u16 %r13, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r13, %rs4; ; CHECK-NEXT: shl.b32 %r14, %r13, 16; ; CHECK-NEXT: mov.b32 %f6, %r14; ; CHECK-NEXT: add.rn.f32 %f7, %f5, %f6; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs6; ; CHECK-NEXT: shl.b32 %r16, %r15, 16; ; CHECK-NEXT: mov.b32 %f8, %r16; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs3; ; CHECK-NEXT: shl.b32 %r18, %r17, 16; ; CHECK-NEXT: mov.b32 %f9, %r18; ; CHECK-NEXT: add.rn.f32 %f10, %f8, %f9; @@ -737,7 +737,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; ; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<13>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; ; CHECK-FTZ-NEXT: .reg .b32 %r<20>; ; CHECK-FTZ-NEXT: .reg .f32 %f<11>; ; CHECK-FTZ-EMPTY: @@ -753,24 +753,24 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r8; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1; ; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r10; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f4, %f3, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r6; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs7; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs5; ; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-FTZ-NEXT: mov.b32 %f5, %r12; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs6; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4; ; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16; ; CHECK-FTZ-NEXT: mov.b32 %f6, %r14; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f7, %f5, %f6; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs8; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs6; ; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16; ; CHECK-FTZ-NEXT: mov.b32 %f8, %r16; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs4; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3; ; CHECK-FTZ-NEXT: shl.b32 %r18, %r17, 16; ; CHECK-FTZ-NEXT: mov.b32 %f9, %r18; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f10, %f8, %f9; @@ -781,8 +781,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<9>; -; CHECK-SM70-NEXT: .reg .b16 %rs<21>; -; CHECK-SM70-NEXT: .reg .b32 %r<62>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; +; CHECK-SM70-NEXT: .reg .b32 %r<61>; ; CHECK-SM70-NEXT: .reg .f32 %f<19>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -793,12 +793,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs8; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -809,14 +809,14 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs7; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -827,15 +827,15 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; ; CHECK-SM70-NEXT: add.rn.f32 %f11, %f10, 0f40E00000; ; CHECK-SM70-NEXT: mov.b32 %r30, %f11; ; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1; @@ -852,7 +852,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f12, %f12; ; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6; -; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs18; +; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10; ; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16; ; CHECK-SM70-NEXT: mov.b32 %f13, %r43; ; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536; @@ -865,7 +865,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15; ; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7; -; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs17; +; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs9; ; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16; ; CHECK-SM70-NEXT: mov.b32 %f16, %r52; ; CHECK-SM70-NEXT: and.b32 %r53, %r35, -65536; @@ -918,8 +918,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf ; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<13>; -; CHECK-SM70-NEXT: .reg .b32 %r<44>; +; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b32 %r<43>; ; CHECK-SM70-NEXT: .reg .f32 %f<13>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -930,12 +930,12 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -949,10 +949,10 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -1165,7 +1165,7 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-LABEL: fma_bf16_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<4>; +; CHECK-SM70-NEXT: .reg .b16 %rs<3>; ; CHECK-SM70-NEXT: .reg .b32 %r<14>; ; CHECK-SM70-NEXT: .reg .f32 %f<6>; ; CHECK-SM70-EMPTY: @@ -1191,8 +1191,8 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; ; CHECK-SM70-NEXT: mov.b32 %f5, %r13; ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2; -; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; +; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-SM70-NEXT: ret; %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %2 = fcmp nsz ogt bfloat %1, 0.0 @@ -1204,7 +1204,7 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .f32 %f<5>; ; CHECK-EMPTY: @@ -1217,18 +1217,18 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f2; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs6; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f2; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs5; ; CHECK-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-NEXT: mov.b32 %f3, %r4; ; CHECK-NEXT: add.rn.f32 %f4, %f3, %f1; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f4; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; ; CHECK-FTZ-NEXT: .reg .b32 %r<5>; ; CHECK-FTZ-NEXT: .reg .f32 %f<5>; ; CHECK-FTZ-EMPTY: @@ -1241,19 +1241,19 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa ; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f2; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f2; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs5; ; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f4, %f3, %f1; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f4; -; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; +; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-FTZ-NEXT: ret; ; ; CHECK-SM70-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<4>; -; CHECK-SM70-NEXT: .reg .b16 %rs<3>; +; CHECK-SM70-NEXT: .reg .b16 %rs<2>; ; CHECK-SM70-NEXT: .reg .b32 %r<27>; ; CHECK-SM70-NEXT: .reg .f32 %f<9>; ; CHECK-SM70-EMPTY: @@ -1334,7 +1334,7 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-LABEL: fma_bf16_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<3>; -; CHECK-SM70-NEXT: .reg .b16 %rs<3>; +; CHECK-SM70-NEXT: .reg .b16 %rs<2>; ; CHECK-SM70-NEXT: .reg .b32 %r<20>; ; CHECK-SM70-NEXT: .reg .f32 %f<7>; ; CHECK-SM70-EMPTY: @@ -1561,7 +1561,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-LABEL: fma_bf16x2_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<19>; +; CHECK-SM70-NEXT: .reg .b16 %rs<11>; ; CHECK-SM70-NEXT: .reg .b32 %r<31>; ; CHECK-SM70-NEXT: .reg .f32 %f<11>; ; CHECK-SM70-EMPTY: @@ -1573,12 +1573,12 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1589,14 +1589,14 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; ; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r15; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -1607,16 +1607,16 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; ; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; ; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; -; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r27; } +; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } ; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; ; CHECK-SM70-NEXT: mov.b32 %f9, %r28; ; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; ; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; ; CHECK-SM70-NEXT: mov.b32 %f10, %r29; ; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; -; CHECK-SM70-NEXT: selp.b16 %rs17, %rs15, 0x0000, %p4; -; CHECK-SM70-NEXT: selp.b16 %rs18, %rs10, 0x0000, %p3; -; CHECK-SM70-NEXT: mov.b32 %r30, {%rs18, %rs17}; +; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; +; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; +; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; ; CHECK-SM70-NEXT: ret; %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) @@ -1629,7 +1629,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b16 %rs<5>; ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-NEXT: .reg .f32 %f<9>; ; CHECK-EMPTY: @@ -1643,17 +1643,17 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-NEXT: mov.b32 %f1, %r6; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs1; ; CHECK-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-NEXT: mov.b32 %f3, %r8; ; CHECK-NEXT: add.rn.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: cvt.u32.u16 %r9, %rs4; ; CHECK-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-NEXT: mov.b32 %f5, %r10; ; CHECK-NEXT: add.rn.f32 %f6, %f5, %f3; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs3; ; CHECK-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-NEXT: mov.b32 %f7, %r12; ; CHECK-NEXT: add.rn.f32 %f8, %f7, %f1; @@ -1663,7 +1663,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; ; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK-FTZ: { -; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; +; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; ; CHECK-FTZ-NEXT: .reg .b32 %r<14>; ; CHECK-FTZ-NEXT: .reg .f32 %f<9>; ; CHECK-FTZ-EMPTY: @@ -1677,17 +1677,17 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-FTZ-NEXT: mov.b32 %f1, %r6; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f2; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; ; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs1; ; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; ; CHECK-FTZ-NEXT: mov.b32 %f3, %r8; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f4, %f3, 0f40E00000; -; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs6; +; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs4; ; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; ; CHECK-FTZ-NEXT: mov.b32 %f5, %r10; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f3; -; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs4; +; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs3; ; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; ; CHECK-FTZ-NEXT: mov.b32 %f7, %r12; ; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f8, %f7, %f1; @@ -1698,8 +1698,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<7>; -; CHECK-SM70-NEXT: .reg .b16 %rs<13>; -; CHECK-SM70-NEXT: .reg .b32 %r<58>; +; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b32 %r<57>; ; CHECK-SM70-NEXT: .reg .f32 %f<17>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -1710,12 +1710,12 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs8; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1729,10 +1729,10 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs7; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; @@ -1822,8 +1822,8 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans( ; CHECK-SM70: { ; CHECK-SM70-NEXT: .reg .pred %p<5>; -; CHECK-SM70-NEXT: .reg .b16 %rs<13>; -; CHECK-SM70-NEXT: .reg .b32 %r<44>; +; CHECK-SM70-NEXT: .reg .b16 %rs<7>; +; CHECK-SM70-NEXT: .reg .b32 %r<43>; ; CHECK-SM70-NEXT: .reg .f32 %f<13>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: @@ -1834,12 +1834,12 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; ; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r5; -; CHECK-SM70-NEXT: mov.b32 {%rs4, %rs5}, %r2; -; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; ; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r7; -; CHECK-SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs7; +; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; ; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r9; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1853,10 +1853,10 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; ; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; ; CHECK-SM70-NEXT: mov.b32 %f5, %r17; -; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs5; +; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; ; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; ; CHECK-SM70-NEXT: mov.b32 %f6, %r19; -; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs8; +; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; ; CHECK-SM70-NEXT: mov.b32 %f7, %r21; ; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; diff --git a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll index 4b56c267ab324..f4a5ff9a333f7 100644 --- a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll +++ b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll @@ -16,7 +16,7 @@ define void @foo() { ; CHECK: and.b16 %rs2, %rs1, 1; ; CHECK: setp.eq.b16 %p1, %rs2, 1; ; CHECK: @%p1 bra $L__BB0_2; -; CHECK: mov.u16 %rs3, 1; +; CHECK: mov.b16 %rs3, 1; ; CHECK: st.global.u8 [i1g], %rs3; ; CHECK: ret; %tmp = load i1, ptr addrspace(1) @i1g, align 2 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 895787d68adfe..accfbe4af0313 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -13,7 +13,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.u64 %rd119, 0; +; CHECK-NEXT: mov.b64 %rd119, 0; ; CHECK-NEXT: sub.cc.s64 %rd52, %rd119, %rd45; ; CHECK-NEXT: subc.cc.s64 %rd53, %rd119, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; @@ -95,7 +95,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: shr.u64 %rd124, %rd4, %r16; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.u64 %rd116, 0; +; CHECK-NEXT: mov.b64 %rd116, 0; ; CHECK-NEXT: mov.u64 %rd119, %rd116; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -180,7 +180,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.u64 %rd105, 0; +; CHECK-NEXT: mov.b64 %rd105, 0; ; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; ; CHECK-NEXT: subc.cc.s64 %rd57, %rd105, 0; ; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0; @@ -233,7 +233,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: shr.u64 %rd110, %rd42, %r16; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.u64 %rd102, 0; +; CHECK-NEXT: mov.b64 %rd102, 0; ; CHECK-NEXT: mov.u64 %rd105, %rd102; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,7 +313,7 @@ define i128 @urem_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0]; ; CHECK-NEXT: and.b64 %rd3, %rd1, 8589934591; -; CHECK-NEXT: mov.u64 %rd4, 0; +; CHECK-NEXT: mov.b64 %rd4, 0; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, 8589934592 @@ -330,7 +330,7 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1]; -; CHECK-NEXT: mov.u64 %rd112, 0; +; CHECK-NEXT: mov.b64 %rd112, 0; ; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45; ; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; @@ -414,7 +414,7 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: shr.u64 %rd117, %rd2, %r16; ; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; -; CHECK-NEXT: mov.u64 %rd109, 0; +; CHECK-NEXT: mov.b64 %rd109, 0; ; CHECK-NEXT: mov.u64 %rd112, %rd109; ; CHECK-NEXT: $L__BB4_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -491,7 +491,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.u64 %rd97, 0; +; CHECK-NEXT: mov.b64 %rd97, 0; ; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; ; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0; ; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0; @@ -544,7 +544,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: shr.u64 %rd102, %rd42, %r16; ; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1; -; CHECK-NEXT: mov.u64 %rd94, 0; +; CHECK-NEXT: mov.b64 %rd94, 0; ; CHECK-NEXT: mov.u64 %rd97, %rd94; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 5d849517096dc..a39e25582f759 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -347,13 +347,13 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_or( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r3, [test_or_param_1]; -; COMMON-NEXT: ld.param.u32 %r4, [test_or_param_0]; -; COMMON-NEXT: or.b32 %r5, %r4, %r3; -; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ld.param.u32 %r2, [test_or_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_or_param_0]; +; COMMON-NEXT: or.b32 %r3, %r1, %r2; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; %r = or <2 x i16> %a, %b ret <2 x i16> %r @@ -369,9 +369,9 @@ define <2 x i16> @test_or_computed(i16 %a) { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.u16 %rs1, [test_or_computed_param_0]; -; COMMON-NEXT: mov.u16 %rs2, 0; +; COMMON-NEXT: mov.b16 %rs2, 0; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; -; COMMON-NEXT: mov.u16 %rs3, 5; +; COMMON-NEXT: mov.b16 %rs3, 5; ; COMMON-NEXT: mov.b32 %r2, {%rs1, %rs3}; ; COMMON-NEXT: or.b32 %r3, %r2, %r1; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; @@ -414,13 +414,13 @@ define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 { define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_xor( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r3, [test_xor_param_1]; -; COMMON-NEXT: ld.param.u32 %r4, [test_xor_param_0]; -; COMMON-NEXT: xor.b32 %r5, %r4, %r3; -; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ld.param.u32 %r2, [test_xor_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_xor_param_0]; +; COMMON-NEXT: xor.b32 %r3, %r1, %r2; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; %r = xor <2 x i16> %a, %b ret <2 x i16> %r @@ -434,9 +434,9 @@ define <2 x i16> @test_xor_computed(i16 %a) { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.u16 %rs1, [test_xor_computed_param_0]; -; COMMON-NEXT: mov.u16 %rs2, 0; +; COMMON-NEXT: mov.b16 %rs2, 0; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; -; COMMON-NEXT: mov.u16 %rs3, 5; +; COMMON-NEXT: mov.b16 %rs3, 5; ; COMMON-NEXT: mov.b32 %r2, {%rs1, %rs3}; ; COMMON-NEXT: xor.b32 %r3, %r2, %r1; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; @@ -479,13 +479,13 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 { define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_and( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r3, [test_and_param_1]; -; COMMON-NEXT: ld.param.u32 %r4, [test_and_param_0]; -; COMMON-NEXT: and.b32 %r5, %r4, %r3; -; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ld.param.u32 %r2, [test_and_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_and_param_0]; +; COMMON-NEXT: and.b32 %r3, %r1, %r2; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; %r = and <2 x i16> %a, %b ret <2 x i16> %r @@ -501,9 +501,9 @@ define <2 x i16> @test_and_computed(i16 %a) { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.u16 %rs1, [test_and_computed_param_0]; -; COMMON-NEXT: mov.u16 %rs2, 0; +; COMMON-NEXT: mov.b16 %rs2, 0; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; -; COMMON-NEXT: mov.u16 %rs3, 5; +; COMMON-NEXT: mov.b16 %rs3, 5; ; COMMON-NEXT: mov.b32 %r2, {%rs1, %rs3}; ; COMMON-NEXT: and.b32 %r3, %r2, %r1; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; @@ -807,7 +807,7 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { ; COMMON-LABEL: test_trunc_2xi32( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0]; @@ -821,16 +821,16 @@ define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 { ; I16x2-LABEL: test_trunc_2xi32_muliple_use0( ; I16x2: { -; I16x2-NEXT: .reg .b32 %r<7>; +; I16x2-NEXT: .reg .b32 %r<6>; ; I16x2-NEXT: .reg .b64 %rd<2>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: ; I16x2-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0]; ; I16x2-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1]; ; I16x2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U; -; I16x2-NEXT: mov.b32 %r5, 65537; -; I16x2-NEXT: add.s16x2 %r6, %r3, %r5; -; I16x2-NEXT: st.u32 [%rd1], %r6; +; I16x2-NEXT: mov.b32 %r4, 65537; +; I16x2-NEXT: add.s16x2 %r5, %r3, %r4; +; I16x2-NEXT: st.u32 [%rd1], %r5; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; ; @@ -863,16 +863,16 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 { define <2 x i16> @test_trunc_2xi32_muliple_use1(<2 x i32> %a, ptr %p) #0 { ; COMMON-LABEL: test_trunc_2xi32_muliple_use1( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-NEXT: .reg .b32 %r<6>; ; COMMON-NEXT: .reg .b64 %rd<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0]; ; COMMON-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use1_param_1]; ; COMMON-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U; -; COMMON-NEXT: add.s32 %r5, %r2, 1; -; COMMON-NEXT: add.s32 %r6, %r1, 1; -; COMMON-NEXT: st.v2.u32 [%rd1], {%r6, %r5}; +; COMMON-NEXT: add.s32 %r4, %r2, 1; +; COMMON-NEXT: add.s32 %r5, %r1, 1; +; COMMON-NEXT: st.v2.u32 [%rd1], {%r5, %r4}; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; %r = trunc <2 x i32> %a to <2 x i16> @@ -939,7 +939,7 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { ; COMMON-LABEL: test_bitcast_i32_to_2xi16( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0]; @@ -952,11 +952,11 @@ define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 { ; COMMON-LABEL: test_bitcast_2xi16_to_i32( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_bitcast_2xi16_to_i32_param_0]; -; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_2xi16_to_i32_param_0]; +; COMMON-NEXT: st.param.b32 [func_retval0], %r1; ; COMMON-NEXT: ret; %r = bitcast <2 x i16> %a to i32 ret i32 %r @@ -966,11 +966,11 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 { ; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf( ; COMMON: { ; COMMON-NEXT: .reg .b16 %rs<3>; -; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0]; -; COMMON-NEXT: mov.u16 %rs2, 5; +; COMMON-NEXT: mov.b16 %rs2, 5; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; COMMON-NEXT: st.param.b32 [func_retval0], %r1; ; COMMON-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 3853ec5c4151a..c2f166770a7ad 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -528,13 +528,13 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_or( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r3, [test_or_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [test_or_param_0]; -; CHECK-NEXT: or.b32 %r5, %r4, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ld.param.u32 %r2, [test_or_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_or_param_0]; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = or <4 x i8> %a, %b ret <4 x i8> %r @@ -544,7 +544,7 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-LABEL: test_or_computed( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; @@ -554,8 +554,8 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; ; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -594,13 +594,13 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 { define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_xor( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r3, [test_xor_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [test_xor_param_0]; -; CHECK-NEXT: xor.b32 %r5, %r4, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ld.param.u32 %r2, [test_xor_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_xor_param_0]; +; CHECK-NEXT: xor.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = xor <4 x i8> %a, %b ret <4 x i8> %r @@ -610,7 +610,7 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-LABEL: test_xor_computed( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; @@ -620,8 +620,8 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; ; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -660,13 +660,13 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 { define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_and( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r3, [test_and_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [test_and_param_0]; -; CHECK-NEXT: and.b32 %r5, %r4, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ld.param.u32 %r2, [test_and_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_and_param_0]; +; CHECK-NEXT: and.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = and <4 x i8> %a, %b ret <4 x i8> %r @@ -676,7 +676,7 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-LABEL: test_and_computed( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; @@ -686,8 +686,8 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; ; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -743,7 +743,7 @@ define void @test_ldst_v2i8(ptr %a, ptr %b) { define void @test_ldst_v3i8(ptr %a, ptr %b) { ; CHECK-LABEL: test_ldst_v3i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -751,8 +751,8 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: st.u16 [%rd2], %r1; -; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; -; CHECK-NEXT: st.u8 [%rd2+2], %r3; +; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; +; CHECK-NEXT: st.u8 [%rd2+2], %r2; ; CHECK-NEXT: ret; %t1 = load <3 x i8>, ptr %a store <3 x i8> %t1, ptr %b, align 16 @@ -1127,7 +1127,7 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 { define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 { ; CHECK-LABEL: test_bitcast_i32_to_4xi8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0]; @@ -1155,11 +1155,11 @@ define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 { define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_4xi8_to_i32_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_4xi8_to_i32_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <4 x i8> %a to i32 ret i32 %r @@ -1168,12 +1168,12 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 { define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_float( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .f32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_4xi8_to_float_param_0]; -; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_4xi8_to_float_param_0]; +; CHECK-NEXT: mov.b32 %f1, %r1; ; CHECK-NEXT: st.param.f32 [func_retval0], %f1; ; CHECK-NEXT: ret; %r = bitcast <4 x i8> %a to float @@ -1185,7 +1185,7 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; @@ -1240,7 +1240,7 @@ define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 { ; CHECK-LABEL: test_insertelement( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_insertelement_param_1]; @@ -1257,28 +1257,28 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; -; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs5; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; -; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; -; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 0x3340U; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; +; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r9, %rs11; +; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r11; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> ret <4 x i8> %r @@ -1288,28 +1288,28 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; -; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs5; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; -; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; -; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 0x3340U; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; +; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r9, %rs11; +; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r11; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> ret <4 x i8> %r @@ -1375,7 +1375,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1392,27 +1392,27 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.u8 %rs7, [%rd2+1]; ; CHECK-NEXT: shl.b16 %rs8, %rs7, 8; ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; -; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; +; CHECK-NEXT: bfe.s32 %r3, %r2, 8, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r3; +; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.s8.s32 %rs12, %r4; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs13; +; CHECK-NEXT: bfe.s32 %r6, %r2, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs14, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs15, %r7; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 0x3340U; -; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: // implicit-def: %r14 -; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 0x5410U; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs16; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; CHECK-NEXT: // implicit-def: %r11 +; CHECK-NEXT: // implicit-def: %r12 +; CHECK-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; +; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r13; } ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index 4da77e28547b6..4ebeba06032c6 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -13,8 +13,8 @@ define void @test_b128_input_from_const() { ; CHECK-NEXT: .reg .b128 %rq<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u64 %rd2, 0; -; CHECK-NEXT: mov.u64 %rd3, 42; +; CHECK-NEXT: mov.b64 %rd2, 0; +; CHECK-NEXT: mov.b64 %rd3, 42; ; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2}; ; CHECK-NEXT: mov.u64 %rd4, value; ; CHECK-NEXT: cvta.global.u64 %rd1, %rd4; @@ -65,7 +65,7 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) { ; CHECK-NEXT: ld.global.u8 %rs1, [%rd3]; ; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0; ; CHECK-NEXT: selp.b64 %rd4, 24, 42, %p1; -; CHECK-NEXT: mov.u64 %rd5, 0; +; CHECK-NEXT: mov.b64 %rd5, 0; ; CHECK-NEXT: mov.b128 %rq1, {%rd4, %rd5}; ; CHECK-NEXT: mov.u64 %rd6, value; ; CHECK-NEXT: cvta.global.u64 %rd1, %rd6; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index e7980accf457e..d5f3d83573e29 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -23,7 +23,7 @@ define void @test_corner_values() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.global.u64 %rd1, [v64]; ; CHECK-NEXT: add.s64 %rd2, %rd1, 8; -; CHECK-NEXT: mov.u64 %rd13, -1; +; CHECK-NEXT: mov.b64 %rd13, -1; ; CHECK-NEXT: mov.b128 %rq1, {%rd13, %rd13}; ; CHECK-NEXT: mov.u64 %rd14, v_u128_max; ; CHECK-NEXT: cvta.global.u64 %rd3, %rd14; @@ -40,7 +40,7 @@ define void @test_corner_values() { ; CHECK-NEXT: ld.global.u64 %rd15, [v64]; ; CHECK-NEXT: add.s64 %rd4, %rd15, 16; ; CHECK-NEXT: add.s64 %rd5, %rd15, 24; -; CHECK-NEXT: mov.u64 %rd16, 9223372036854775807; +; CHECK-NEXT: mov.b64 %rd16, 9223372036854775807; ; CHECK-NEXT: mov.b128 %rq2, {%rd13, %rd16}; ; CHECK-NEXT: mov.u64 %rd17, v_i128_max; ; CHECK-NEXT: cvta.global.u64 %rd6, %rd17; @@ -57,8 +57,8 @@ define void @test_corner_values() { ; CHECK-NEXT: ld.global.u64 %rd18, [v64]; ; CHECK-NEXT: add.s64 %rd7, %rd18, 32; ; CHECK-NEXT: add.s64 %rd8, %rd18, 40; -; CHECK-NEXT: mov.u64 %rd19, -9223372036854775808; -; CHECK-NEXT: mov.u64 %rd20, 0; +; CHECK-NEXT: mov.b64 %rd19, -9223372036854775808; +; CHECK-NEXT: mov.b64 %rd20, 0; ; CHECK-NEXT: mov.b128 %rq3, {%rd20, %rd19}; ; CHECK-NEXT: mov.u64 %rd21, v_i128_min; ; CHECK-NEXT: cvta.global.u64 %rd9, %rd21; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 00cfa3daf4a7c..a3d40169c4695 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -21,7 +21,7 @@ define void @test_b128_in_loop() { ; CHECK-NEXT: // %bb.1: // %BB1 ; CHECK-NEXT: ld.global.u64 %rd13, [x+8]; ; CHECK-NEXT: ld.global.u64 %rd12, [x]; -; CHECK-NEXT: mov.u64 %rd14, 0; +; CHECK-NEXT: mov.b64 %rd14, 0; ; CHECK-NEXT: $L__BB0_2: // %BB2 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13}; diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index 5161e5d029777..189f3421cd03a 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -621,26 +621,26 @@ define half @minimum_half(half %a, half %b) { ; CHECK-NOF16-LABEL: minimum_half( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<10>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<8>; ; CHECK-NOF16-NEXT: .reg .f32 %f<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0]; -; CHECK-NOF16-NEXT: ld.param.b16 %rs3, [minimum_half_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs3; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1; -; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1; +; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; -; CHECK-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; ; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs1, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3; -; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs3, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs5; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; +; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs2, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs4; ; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5; -; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs9; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_half( @@ -657,26 +657,26 @@ define half @minimum_half(half %a, half %b) { ; CHECK-SM80-NOF16-LABEL: minimum_half( ; CHECK-SM80-NOF16: { ; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<10>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<8>; ; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0]; -; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs3, [minimum_half_param_1]; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs3; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs1, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs3, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs5; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs2, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs4; ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5; -; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; ; CHECK-SM80-NOF16-NEXT: ret; %x = call half @llvm.minimum.f16(half %a, half %b) ret half %x @@ -897,7 +897,7 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: minimum_v2half( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .pred %p<11>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<19>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<15>; ; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-NEXT: .reg .f32 %f<7>; ; CHECK-NOF16-EMPTY: @@ -913,26 +913,26 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; ; CHECK-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; ; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs4, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3; ; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs2, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs6; ; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; ; CHECK-NOF16-NEXT: setp.lt.f32 %p6, %f5, %f4; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6; ; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4; -; CHECK-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7; +; CHECK-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; ; CHECK-NOF16-NEXT: setp.eq.s16 %p8, %rs3, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8; ; CHECK-NOF16-NEXT: setp.eq.s16 %p9, %rs1, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs13; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs11; ; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10; -; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11}; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs14, %rs9}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NOF16-NEXT: ret; ; @@ -950,7 +950,7 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-SM80-NOF16-LABEL: minimum_v2half( ; CHECK-SM80-NOF16: { ; CHECK-SM80-NOF16-NEXT: .reg .pred %p<11>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<19>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<15>; ; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<7>; ; CHECK-SM80-NOF16-EMPTY: @@ -966,26 +966,26 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; ; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs4, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs2, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs6; ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; ; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p6, %f5, %f4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6; ; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p8, %rs3, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p9, %rs1, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f6, %rs13; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f6, %rs11; ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10; -; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11}; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs14, %rs9}; ; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) @@ -1179,26 +1179,26 @@ define half @maximum_half(half %a, half %b) { ; CHECK-NOF16-LABEL: maximum_half( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<10>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<8>; ; CHECK-NOF16-NEXT: .reg .f32 %f<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0]; -; CHECK-NOF16-NEXT: ld.param.b16 %rs3, [maximum_half_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs3; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1; -; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1; +; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; -; CHECK-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; ; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs1, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3; -; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs3, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs5; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; +; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs2, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs4; ; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5; -; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs9; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_half( @@ -1215,26 +1215,26 @@ define half @maximum_half(half %a, half %b) { ; CHECK-SM80-NOF16-LABEL: maximum_half( ; CHECK-SM80-NOF16: { ; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<10>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<8>; ; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0]; -; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs3, [maximum_half_param_1]; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs3; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs1, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs3, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs5; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs2, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs4; ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5; -; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; ; CHECK-SM80-NOF16-NEXT: ret; %x = call half @llvm.maximum.f16(half %a, half %b) ret half %x @@ -1447,7 +1447,7 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: maximum_v2half( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .pred %p<11>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<19>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<15>; ; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-NEXT: .reg .f32 %f<7>; ; CHECK-NOF16-EMPTY: @@ -1463,26 +1463,26 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; ; CHECK-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; ; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs4, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3; ; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs2, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs6; ; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; ; CHECK-NOF16-NEXT: setp.gt.f32 %p6, %f5, %f4; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6; ; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4; -; CHECK-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7; +; CHECK-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; ; CHECK-NOF16-NEXT: setp.eq.s16 %p8, %rs3, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8; ; CHECK-NOF16-NEXT: setp.eq.s16 %p9, %rs1, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9; -; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs13; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs11; ; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10; -; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11}; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs14, %rs9}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NOF16-NEXT: ret; ; @@ -1500,7 +1500,7 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-SM80-NOF16-LABEL: maximum_v2half( ; CHECK-SM80-NOF16: { ; CHECK-SM80-NOF16-NEXT: .reg .pred %p<11>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<19>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<15>; ; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<7>; ; CHECK-SM80-NOF16-EMPTY: @@ -1516,26 +1516,26 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1; ; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs4, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs2, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs6; ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; ; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p6, %f5, %f4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6; ; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p8, %rs3, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8; ; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p9, %rs1, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f6, %rs13; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f6, %rs11; ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10; -; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11}; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs14, %rs9}; ; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index da0e6b7ca5860..32eb998e7d1ff 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -16,7 +16,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: ld.param.u32 %r3, [wombat_param_1]; ; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0]; ; CHECK-NEXT: mov.b32 %r10, 0; -; CHECK-NEXT: mov.u64 %rd1, 0; +; CHECK-NEXT: mov.b64 %rd1, 0; ; CHECK-NEXT: mov.b32 %r6, 1; ; CHECK-NEXT: $L__BB0_1: // %bb3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll index 4e6a5ea12bb75..a48ea6a0a9551 100644 --- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -4,9 +4,9 @@ ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_kernel void @t1(ptr %a) { -; PTX32: mov.u16 %rs{{[0-9]+}}, 0; +; PTX32: mov.b16 %rs{{[0-9]+}}, 0; ; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; -; PTX64: mov.u16 %rs{{[0-9]+}}, 0; +; PTX64: mov.b16 %rs{{[0-9]+}}, 0; ; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, ptr %a ret void diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll index 218ed65374491..cf2433ad75a97 100644 --- a/llvm/test/CodeGen/NVPTX/reg-types.ll +++ b/llvm/test/CodeGen/NVPTX/reg-types.ll @@ -31,16 +31,16 @@ entry: ; Verify that we use correct register types. store i8 1, ptr %s8, align 1 -; CHECK: mov.u16 [[R1:%rs[0-9]]], 1; +; CHECK: mov.b16 [[R1:%rs[0-9]]], 1; ; CHECK-NEXT: st.u8 {{.*}}, [[R1]] store i8 2, ptr %u8, align 1 -; CHECK: mov.u16 [[R2:%rs[0-9]]], 2; +; CHECK: mov.b16 [[R2:%rs[0-9]]], 2; ; CHECK-NEXT: st.u8 {{.*}}, [[R2]] store i16 3, ptr %s16, align 2 -; CHECK: mov.u16 [[R3:%rs[0-9]]], 3; +; CHECK: mov.b16 [[R3:%rs[0-9]]], 3; ; CHECK-NEXT: st.u16 {{.*}}, [[R3]] store i16 4, ptr %u16, align 2 -; CHECK: mov.u16 [[R4:%rs[0-9]]], 4; +; CHECK: mov.b16 [[R4:%rs[0-9]]], 4; ; CHECK-NEXT: st.u16 {{.*}}, [[R4]] store i32 5, ptr %s32, align 4 ; CHECK: mov.b32 [[R5:%r[0-9]]], 5; @@ -49,10 +49,10 @@ entry: ; CHECK: mov.b32 [[R6:%r[0-9]]], 6; ; CHECK-NEXT: st.u32 {{.*}}, [[R6]] store i64 7, ptr %s64, align 8 -; CHECK: mov.u64 [[R7:%rd[0-9]]], 7; +; CHECK: mov.b64 [[R7:%rd[0-9]]], 7; ; CHECK-NEXT: st.u64 {{.*}}, [[R7]] store i64 8, ptr %u64, align 8 -; CHECK: mov.u64 [[R8:%rd[0-9]]], 8; +; CHECK: mov.b64 [[R8:%rd[0-9]]], 8; ; CHECK-NEXT: st.u64 {{.*}}, [[R8]] ; FP constants are stored via integer registers, but that's an diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll index 044d21643ed9d..303c649b794fd 100644 --- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll @@ -60,17 +60,17 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: out_v4i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_2]; -; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_param_0]; -; CHECK-NEXT: and.b32 %r5, %r4, %r1; -; CHECK-NEXT: xor.b32 %r7, %r1, -1; -; CHECK-NEXT: and.b32 %r8, %r3, %r7; -; CHECK-NEXT: or.b32 %r9, %r5, %r8; -; CHECK-NEXT: st.param.b32 [func_retval0], %r9; +; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_1]; +; CHECK-NEXT: ld.param.u32 %r2, [out_v4i8_param_0]; +; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_2]; +; CHECK-NEXT: and.b32 %r4, %r2, %r3; +; CHECK-NEXT: xor.b32 %r5, %r3, -1; +; CHECK-NEXT: and.b32 %r6, %r1, %r5; +; CHECK-NEXT: or.b32 %r7, %r4, %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %mx = and <4 x i8> %x, %mask %notmask = xor <4 x i8> %mask, @@ -82,17 +82,17 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: out_v4i8_undef( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_2]; -; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0]; -; CHECK-NEXT: and.b32 %r5, %r4, %r1; -; CHECK-NEXT: xor.b32 %r7, %r1, -16711681; -; CHECK-NEXT: and.b32 %r8, %r3, %r7; -; CHECK-NEXT: or.b32 %r9, %r5, %r8; -; CHECK-NEXT: st.param.b32 [func_retval0], %r9; +; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_1]; +; CHECK-NEXT: ld.param.u32 %r2, [out_v4i8_undef_param_0]; +; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_2]; +; CHECK-NEXT: and.b32 %r4, %r2, %r3; +; CHECK-NEXT: xor.b32 %r5, %r3, -16711681; +; CHECK-NEXT: and.b32 %r6, %r1, %r5; +; CHECK-NEXT: or.b32 %r7, %r4, %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %mx = and <4 x i8> %x, %mask %notmask = xor <4 x i8> %mask, @@ -104,17 +104,17 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: out_v2i16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [out_v2i16_param_2]; -; CHECK-NEXT: ld.param.u32 %r3, [out_v2i16_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [out_v2i16_param_0]; -; CHECK-NEXT: and.b32 %r5, %r4, %r1; -; CHECK-NEXT: xor.b32 %r7, %r1, -1; -; CHECK-NEXT: and.b32 %r8, %r3, %r7; -; CHECK-NEXT: or.b32 %r9, %r5, %r8; -; CHECK-NEXT: st.param.b32 [func_retval0], %r9; +; CHECK-NEXT: ld.param.u32 %r1, [out_v2i16_param_1]; +; CHECK-NEXT: ld.param.u32 %r2, [out_v2i16_param_0]; +; CHECK-NEXT: ld.param.u32 %r3, [out_v2i16_param_2]; +; CHECK-NEXT: and.b32 %r4, %r2, %r3; +; CHECK-NEXT: xor.b32 %r5, %r3, -1; +; CHECK-NEXT: and.b32 %r6, %r1, %r5; +; CHECK-NEXT: or.b32 %r7, %r4, %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %mx = and <2 x i16> %x, %mask %notmask = xor <2 x i16> %mask, @@ -152,21 +152,21 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: out_v8i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<21>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_1]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [out_v8i8_param_2]; -; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v8i8_param_0]; -; CHECK-NEXT: and.b32 %r11, %r9, %r5; -; CHECK-NEXT: and.b32 %r13, %r10, %r6; -; CHECK-NEXT: xor.b32 %r15, %r6, -1; -; CHECK-NEXT: xor.b32 %r16, %r5, -1; -; CHECK-NEXT: and.b32 %r17, %r1, %r16; -; CHECK-NEXT: and.b32 %r18, %r2, %r15; -; CHECK-NEXT: or.b32 %r19, %r13, %r18; -; CHECK-NEXT: or.b32 %r20, %r11, %r17; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r20, %r19}; +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_0]; +; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v8i8_param_2]; +; CHECK-NEXT: and.b32 %r5, %r1, %r3; +; CHECK-NEXT: and.b32 %r6, %r2, %r4; +; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v8i8_param_1]; +; CHECK-NEXT: xor.b32 %r9, %r4, -1; +; CHECK-NEXT: xor.b32 %r10, %r3, -1; +; CHECK-NEXT: and.b32 %r11, %r7, %r10; +; CHECK-NEXT: and.b32 %r12, %r8, %r9; +; CHECK-NEXT: or.b32 %r13, %r6, %r12; +; CHECK-NEXT: or.b32 %r14, %r5, %r11; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13}; ; CHECK-NEXT: ret; %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -178,21 +178,21 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<21>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_param_1]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [out_v4i16_param_2]; -; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v4i16_param_0]; -; CHECK-NEXT: and.b32 %r11, %r9, %r5; -; CHECK-NEXT: and.b32 %r13, %r10, %r6; -; CHECK-NEXT: xor.b32 %r15, %r6, -1; -; CHECK-NEXT: xor.b32 %r16, %r5, -1; -; CHECK-NEXT: and.b32 %r17, %r1, %r16; -; CHECK-NEXT: and.b32 %r18, %r2, %r15; -; CHECK-NEXT: or.b32 %r19, %r13, %r18; -; CHECK-NEXT: or.b32 %r20, %r11, %r17; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r20, %r19}; +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_param_0]; +; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v4i16_param_2]; +; CHECK-NEXT: and.b32 %r5, %r1, %r3; +; CHECK-NEXT: and.b32 %r6, %r2, %r4; +; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v4i16_param_1]; +; CHECK-NEXT: xor.b32 %r9, %r4, -1; +; CHECK-NEXT: xor.b32 %r10, %r3, -1; +; CHECK-NEXT: and.b32 %r11, %r7, %r10; +; CHECK-NEXT: and.b32 %r12, %r8, %r9; +; CHECK-NEXT: or.b32 %r13, %r6, %r12; +; CHECK-NEXT: or.b32 %r14, %r5, %r11; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13}; ; CHECK-NEXT: ret; %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -204,21 +204,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16_undef( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<21>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_1]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [out_v4i16_undef_param_2]; -; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v4i16_undef_param_0]; -; CHECK-NEXT: and.b32 %r11, %r9, %r5; -; CHECK-NEXT: and.b32 %r13, %r10, %r6; -; CHECK-NEXT: xor.b32 %r15, %r6, -65536; -; CHECK-NEXT: xor.b32 %r16, %r5, -1; -; CHECK-NEXT: and.b32 %r17, %r1, %r16; -; CHECK-NEXT: and.b32 %r18, %r2, %r15; -; CHECK-NEXT: or.b32 %r19, %r13, %r18; -; CHECK-NEXT: or.b32 %r20, %r11, %r17; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r20, %r19}; +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_0]; +; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v4i16_undef_param_2]; +; CHECK-NEXT: and.b32 %r5, %r1, %r3; +; CHECK-NEXT: and.b32 %r6, %r2, %r4; +; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v4i16_undef_param_1]; +; CHECK-NEXT: xor.b32 %r9, %r4, -65536; +; CHECK-NEXT: xor.b32 %r10, %r3, -1; +; CHECK-NEXT: and.b32 %r11, %r7, %r10; +; CHECK-NEXT: and.b32 %r12, %r8, %r9; +; CHECK-NEXT: or.b32 %r13, %r6, %r12; +; CHECK-NEXT: or.b32 %r14, %r5, %r11; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13}; ; CHECK-NEXT: ret; %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -282,29 +282,29 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: out_v16i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<41>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_1]; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [out_v16i8_param_2]; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [out_v16i8_param_0]; -; CHECK-NEXT: and.b32 %r21, %r17, %r9; -; CHECK-NEXT: and.b32 %r23, %r18, %r10; -; CHECK-NEXT: and.b32 %r25, %r19, %r11; -; CHECK-NEXT: and.b32 %r27, %r20, %r12; -; CHECK-NEXT: xor.b32 %r29, %r12, -1; -; CHECK-NEXT: xor.b32 %r30, %r11, -1; -; CHECK-NEXT: xor.b32 %r31, %r10, -1; -; CHECK-NEXT: xor.b32 %r32, %r9, -1; -; CHECK-NEXT: and.b32 %r33, %r1, %r32; -; CHECK-NEXT: and.b32 %r34, %r2, %r31; -; CHECK-NEXT: and.b32 %r35, %r3, %r30; -; CHECK-NEXT: and.b32 %r36, %r4, %r29; -; CHECK-NEXT: or.b32 %r37, %r27, %r36; -; CHECK-NEXT: or.b32 %r38, %r25, %r35; -; CHECK-NEXT: or.b32 %r39, %r23, %r34; -; CHECK-NEXT: or.b32 %r40, %r21, %r33; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r39, %r38, %r37}; +; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_0]; +; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v16i8_param_2]; +; CHECK-NEXT: and.b32 %r9, %r1, %r5; +; CHECK-NEXT: and.b32 %r10, %r2, %r6; +; CHECK-NEXT: and.b32 %r11, %r3, %r7; +; CHECK-NEXT: and.b32 %r12, %r4, %r8; +; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v16i8_param_1]; +; CHECK-NEXT: xor.b32 %r17, %r8, -1; +; CHECK-NEXT: xor.b32 %r18, %r7, -1; +; CHECK-NEXT: xor.b32 %r19, %r6, -1; +; CHECK-NEXT: xor.b32 %r20, %r5, -1; +; CHECK-NEXT: and.b32 %r21, %r13, %r20; +; CHECK-NEXT: and.b32 %r22, %r14, %r19; +; CHECK-NEXT: and.b32 %r23, %r15, %r18; +; CHECK-NEXT: and.b32 %r24, %r16, %r17; +; CHECK-NEXT: or.b32 %r25, %r12, %r24; +; CHECK-NEXT: or.b32 %r26, %r11, %r23; +; CHECK-NEXT: or.b32 %r27, %r10, %r22; +; CHECK-NEXT: or.b32 %r28, %r9, %r21; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25}; ; CHECK-NEXT: ret; %mx = and <16 x i8> %x, %mask %notmask = xor <16 x i8> %mask, @@ -316,29 +316,29 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: out_v8i16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<41>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_1]; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [out_v8i16_param_2]; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [out_v8i16_param_0]; -; CHECK-NEXT: and.b32 %r21, %r17, %r9; -; CHECK-NEXT: and.b32 %r23, %r18, %r10; -; CHECK-NEXT: and.b32 %r25, %r19, %r11; -; CHECK-NEXT: and.b32 %r27, %r20, %r12; -; CHECK-NEXT: xor.b32 %r29, %r12, -1; -; CHECK-NEXT: xor.b32 %r30, %r11, -1; -; CHECK-NEXT: xor.b32 %r31, %r10, -1; -; CHECK-NEXT: xor.b32 %r32, %r9, -1; -; CHECK-NEXT: and.b32 %r33, %r1, %r32; -; CHECK-NEXT: and.b32 %r34, %r2, %r31; -; CHECK-NEXT: and.b32 %r35, %r3, %r30; -; CHECK-NEXT: and.b32 %r36, %r4, %r29; -; CHECK-NEXT: or.b32 %r37, %r27, %r36; -; CHECK-NEXT: or.b32 %r38, %r25, %r35; -; CHECK-NEXT: or.b32 %r39, %r23, %r34; -; CHECK-NEXT: or.b32 %r40, %r21, %r33; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r39, %r38, %r37}; +; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_0]; +; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v8i16_param_2]; +; CHECK-NEXT: and.b32 %r9, %r1, %r5; +; CHECK-NEXT: and.b32 %r10, %r2, %r6; +; CHECK-NEXT: and.b32 %r11, %r3, %r7; +; CHECK-NEXT: and.b32 %r12, %r4, %r8; +; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v8i16_param_1]; +; CHECK-NEXT: xor.b32 %r17, %r8, -1; +; CHECK-NEXT: xor.b32 %r18, %r7, -1; +; CHECK-NEXT: xor.b32 %r19, %r6, -1; +; CHECK-NEXT: xor.b32 %r20, %r5, -1; +; CHECK-NEXT: and.b32 %r21, %r13, %r20; +; CHECK-NEXT: and.b32 %r22, %r14, %r19; +; CHECK-NEXT: and.b32 %r23, %r15, %r18; +; CHECK-NEXT: and.b32 %r24, %r16, %r17; +; CHECK-NEXT: or.b32 %r25, %r12, %r24; +; CHECK-NEXT: or.b32 %r26, %r11, %r23; +; CHECK-NEXT: or.b32 %r27, %r10, %r22; +; CHECK-NEXT: or.b32 %r28, %r9, %r21; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25}; ; CHECK-NEXT: ret; %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -497,7 +497,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [in_v4i8_param_0]; @@ -517,7 +517,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [in_v2i16_param_0]; @@ -561,7 +561,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v8i8_param_0]; @@ -570,10 +570,10 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-NEXT: xor.b32 %r7, %r2, %r4; ; CHECK-NEXT: and.b32 %r8, %r7, %r6; ; CHECK-NEXT: xor.b32 %r9, %r8, %r4; -; CHECK-NEXT: xor.b32 %r11, %r1, %r3; -; CHECK-NEXT: and.b32 %r12, %r11, %r5; -; CHECK-NEXT: xor.b32 %r13, %r12, %r3; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r13, %r9}; +; CHECK-NEXT: xor.b32 %r10, %r1, %r3; +; CHECK-NEXT: and.b32 %r11, %r10, %r5; +; CHECK-NEXT: xor.b32 %r12, %r11, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r9}; ; CHECK-NEXT: ret; %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -584,7 +584,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v4i16_param_0]; @@ -593,10 +593,10 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind ; CHECK-NEXT: xor.b32 %r7, %r2, %r4; ; CHECK-NEXT: and.b32 %r8, %r7, %r6; ; CHECK-NEXT: xor.b32 %r9, %r8, %r4; -; CHECK-NEXT: xor.b32 %r11, %r1, %r3; -; CHECK-NEXT: and.b32 %r12, %r11, %r5; -; CHECK-NEXT: xor.b32 %r13, %r12, %r3; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r13, %r9}; +; CHECK-NEXT: xor.b32 %r10, %r1, %r3; +; CHECK-NEXT: and.b32 %r11, %r10, %r5; +; CHECK-NEXT: xor.b32 %r12, %r11, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r9}; ; CHECK-NEXT: ret; %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -654,7 +654,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-NEXT: .reg .b32 %r<25>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0]; @@ -669,10 +669,10 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-NEXT: and.b32 %r19, %r10, %r15; ; CHECK-NEXT: and.b32 %r20, %r9, %r16; ; CHECK-NEXT: xor.b32 %r21, %r20, %r8; -; CHECK-NEXT: xor.b32 %r23, %r19, %r7; -; CHECK-NEXT: xor.b32 %r25, %r18, %r6; -; CHECK-NEXT: xor.b32 %r27, %r17, %r5; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r27, %r25, %r23, %r21}; +; CHECK-NEXT: xor.b32 %r22, %r19, %r7; +; CHECK-NEXT: xor.b32 %r23, %r18, %r6; +; CHECK-NEXT: xor.b32 %r24, %r17, %r5; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r24, %r23, %r22, %r21}; ; CHECK-NEXT: ret; %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -683,7 +683,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-NEXT: .reg .b32 %r<25>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v8i16_param_0]; @@ -698,10 +698,10 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind ; CHECK-NEXT: and.b32 %r19, %r10, %r15; ; CHECK-NEXT: and.b32 %r20, %r9, %r16; ; CHECK-NEXT: xor.b32 %r21, %r20, %r8; -; CHECK-NEXT: xor.b32 %r23, %r19, %r7; -; CHECK-NEXT: xor.b32 %r25, %r18, %r6; -; CHECK-NEXT: xor.b32 %r27, %r17, %r5; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r27, %r25, %r23, %r21}; +; CHECK-NEXT: xor.b32 %r22, %r19, %r7; +; CHECK-NEXT: xor.b32 %r23, %r18, %r6; +; CHECK-NEXT: xor.b32 %r24, %r17, %r5; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r24, %r23, %r22, %r21}; ; CHECK-NEXT: ret; %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll index 8ecdff9d65ac1..697bdcd935fae 100644 --- a/llvm/test/CodeGen/NVPTX/vaargs.ll +++ b/llvm/test/CodeGen/NVPTX/vaargs.ll @@ -16,7 +16,7 @@ entry: ; Test va_start ; CHECK: .param .align 8 .b8 foo_vararg[] -; CHECK: mov.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg; +; CHECK: mov.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg; ; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR]]; call void @llvm.va_start(ptr %al) diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 98b051f92a20f..cb54812dea6d9 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -111,13 +111,13 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot1; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-PTX-NEXT: mov.u64 %rd1, 4294967297; +; CHECK-PTX-NEXT: mov.b64 %rd1, 4294967297; ; CHECK-PTX-NEXT: st.u64 [%SP], %rd1; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; ; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; -; CHECK-PTX-NEXT: mov.u64 %rd2, 1; +; CHECK-PTX-NEXT: mov.b64 %rd2, 1; ; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd2; -; CHECK-PTX-NEXT: mov.u64 %rd3, 4607182418800017408; +; CHECK-PTX-NEXT: mov.b64 %rd3, 4607182418800017408; ; CHECK-PTX-NEXT: st.u64 [%SP+24], %rd3; ; CHECK-PTX-NEXT: st.u64 [%SP+32], %rd3; ; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 0; @@ -242,9 +242,9 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; ; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8; ; CHECK-PTX-NEXT: or.b64 %rd6, %rd5, 4; -; CHECK-PTX-NEXT: mov.u16 %rs9, 1; +; CHECK-PTX-NEXT: mov.b16 %rs9, 1; ; CHECK-PTX-NEXT: st.u8 [%rd6], %rs9; -; CHECK-PTX-NEXT: mov.u64 %rd7, 1; +; CHECK-PTX-NEXT: mov.b64 %rd7, 1; ; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd7; ; CHECK-PTX-NEXT: { // callseq 1, 0 ; CHECK-PTX-NEXT: .param .b32 param0; @@ -400,7 +400,7 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 8; ; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3]; ; CHECK-PTX-NEXT: st.u64 [%SP+8], %rd4; -; CHECK-PTX-NEXT: mov.u64 %rd5, 1; +; CHECK-PTX-NEXT: mov.b64 %rd5, 1; ; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5; ; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 16; ; CHECK-PTX-NEXT: { // callseq 3, 0 diff --git a/llvm/test/CodeGen/NVPTX/vector-returns.ll b/llvm/test/CodeGen/NVPTX/vector-returns.ll index bb120ee2ea019..2001d199ce0a7 100644 --- a/llvm/test/CodeGen/NVPTX/vector-returns.ll +++ b/llvm/test/CodeGen/NVPTX/vector-returns.ll @@ -9,7 +9,7 @@ define <3 x i64> @long3() { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u64 %rd1, 0; +; CHECK-NEXT: mov.b64 %rd1, 0; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd1}; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd1; ; CHECK-NEXT: ret; @@ -22,7 +22,7 @@ define <2 x i64> @long2() { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u64 %rd1, 0; +; CHECK-NEXT: mov.b64 %rd1, 0; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd1}; ; CHECK-NEXT: ret; ret <2 x i64> zeroinitializer @@ -34,7 +34,7 @@ define <1 x i64> @long1() { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u64 %rd1, 0; +; CHECK-NEXT: mov.b64 %rd1, 0; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; ret <1 x i64> zeroinitializer @@ -108,7 +108,7 @@ define <9 x i16> @short9() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b16 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+16], %rs1; @@ -134,7 +134,7 @@ define <7 x i16> @short7() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b16 [func_retval0+8], {%rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+12], %rs1; @@ -148,7 +148,7 @@ define <5 x i16> @short5() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs1; ; CHECK-NEXT: ret; @@ -173,7 +173,7 @@ define <3 x i16> @short3() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; @@ -198,7 +198,7 @@ define <1 x i16> @short1() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; ret <1 x i16> zeroinitializer @@ -210,7 +210,7 @@ define <17 x i8> @byte17() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; @@ -238,7 +238,7 @@ define <15 x i8> @byte15() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; @@ -254,7 +254,7 @@ define <9 x i8> @byte9() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1; @@ -280,7 +280,7 @@ define <7 x i8> @byte7() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; ; CHECK-NEXT: st.param.b8 [func_retval0+6], %rs1; @@ -294,7 +294,7 @@ define <5 x i8> @byte5() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; @@ -343,7 +343,7 @@ define <1 x i8> @byte1() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: ret; ret <1 x i8> zeroinitializer @@ -355,7 +355,7 @@ define <17 x i1> @bit17() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; @@ -371,7 +371,7 @@ define <16 x i1> @bit16() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v2.b8 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+2], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; @@ -390,7 +390,7 @@ define <15 x i1> @bit15() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v2.b8 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+2], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; @@ -409,7 +409,7 @@ define <9 x i1> @bit9() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.v2.b8 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+2], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; @@ -425,7 +425,7 @@ define <8 x i1> @bit8() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; @@ -444,7 +444,7 @@ define <7 x i1> @bit7() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; @@ -462,7 +462,7 @@ define <5 x i1> @bit5() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; @@ -478,7 +478,7 @@ define <4 x i1> @bit4() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; @@ -493,7 +493,7 @@ define <3 x i1> @bit3() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; @@ -507,7 +507,7 @@ define <2 x i1> @bit2() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: ret; @@ -520,7 +520,7 @@ define <1 x i1> @bit1() { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u16 %rs1, 0; +; CHECK-NEXT: mov.b16 %rs1, 0; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: ret; ret <1 x i1> zeroinitializer From 2756e290002bc4514abcea208c32f1783ed945f7 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Wed, 18 Dec 2024 23:43:17 +0000 Subject: [PATCH 2/2] further mov cleanup --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 19 --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 123 +++++-------------- llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 6 +- llvm/test/CodeGen/NVPTX/misched_func_call.ll | 9 +- 4 files changed, 38 insertions(+), 119 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 94e90a84a2d41..eadff13070822 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -176,10 +176,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { case ISD::ADDRSPACECAST: SelectAddrSpaceCast(N); return; - case ISD::ConstantFP: - if (tryConstantFP(N)) - return; - break; case ISD::CopyToReg: { if (N->getOperand(1).getValueType() == MVT::i128) { SelectV2I64toI128(N); @@ -212,21 +208,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { } } -// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we -// have to load them into an .(b)f16 register first. -bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) { - if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16) - return false; - SDValue Val = CurDAG->getTargetConstantFP( - cast(N)->getValueAPF(), SDLoc(N), N->getValueType(0)); - SDNode *LoadConstF16 = CurDAG->getMachineNode( - (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16 - : NVPTX::LOAD_CONST_BF16), - SDLoc(N), N->getValueType(0), Val); - ReplaceNode(N, LoadConstF16); - return true; -} - // Map ISD:CONDCODE value to appropriate CmpMode expected by // NVPTXInstPrinter::printCmpMode() static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index eb4918c43f0dc..711cd67eceed9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1177,17 +1177,6 @@ def NegDoubleConst : SDNodeXForm; -// Loads FP16 constant into a register. -// -// ptxas does not have hex representation for fp16, so we can't use -// fp16 immediate values in .f16 instructions. Instead we have to load -// the constant into a register using mov.b16. -def LOAD_CONST_F16 : - NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a), - "mov.b16 \t$dst, $a;", []>; -def LOAD_CONST_BF16 : - NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a), - "mov.b16 \t$dst, $a;", []>; defm FADD : F3_fma_component<"add", fadd>; defm FSUB : F3_fma_component<"sub", fsub>; defm FMUL : F3_fma_component<"mul", fmul>; @@ -1963,7 +1952,7 @@ let hasSideEffects = false in { // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp -let IsSimpleMove=1, hasSideEffects=0 in { +let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in { def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), "mov.pred \t$dst, $sss;", []>; def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), @@ -1975,44 +1964,40 @@ let IsSimpleMove=1, hasSideEffects=0 in { def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss), "mov.b128 \t$dst, $sss;", []>; - def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), - "mov.b16 \t$dst, $sss;", []>; - def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), - "mov.b32 \t$dst, $sss;", []>; - def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), - "mov.b64 \t$dst, $sss;", []>; - - def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - // We have to use .b16 here as there's no mov.f16. - "mov.b16 \t$dst, $src;", []>; def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "mov.f32 \t$dst, $src;", []>; def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), "mov.f64 \t$dst, $src;", []>; -} -def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), - "mov.pred \t$dst, $src;", - [(set i1:$dst, imm:$src)]>; -def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.b16 \t$dst, $src;", - [(set i16:$dst, imm:$src)]>; -def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.b32 \t$dst, $src;", - [(set i32:$dst, imm:$src)]>; -def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.b64 \t$dst, $src;", - [(set i64:$dst, imm:$src)]>; - -def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), - "mov.f32 \t$dst, $src;", - [(set f32:$dst, fpimm:$src)]>; -def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), - "mov.f64 \t$dst, $src;", - [(set f64:$dst, fpimm:$src)]>; - -def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOVB32ri texternalsym:$dst)>; -def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOVB64ri texternalsym:$dst)>; + def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), + "mov.pred \t$dst, $src;", + [(set i1:$dst, imm:$src)]>; + def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), + "mov.b16 \t$dst, $src;", + [(set i16:$dst, imm:$src)]>; + def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), + "mov.b32 \t$dst, $src;", + [(set i32:$dst, imm:$src)]>; + def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), + "mov.b64 \t$dst, $src;", + [(set i64:$dst, imm:$src)]>; + + def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src), + "mov.b16 \t$dst, $src;", + [(set f16:$dst, fpimm:$src)]>; + def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src), + "mov.b16 \t$dst, $src;", + [(set bf16:$dst, fpimm:$src)]>; + def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), + "mov.f32 \t$dst, $src;", + [(set f32:$dst, fpimm:$src)]>; + def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), + "mov.f64 \t$dst, $src;", + [(set f64:$dst, fpimm:$src)]>; +} + +def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; +def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), @@ -2208,18 +2193,6 @@ multiclass FSET_FORMAT { def : Pat<(i1 (OpNode f16:$a, f16:$b)), (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, Requires<[useFP16Math]>; - def : Pat<(i1 (OpNode f16:$a, fpimm:$b)), - (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode f16:$a, fpimm:$b)), - (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i1 (OpNode fpimm:$a, f16:$b)), - (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode fpimm:$a, f16:$b)), - (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, - Requires<[useFP16Math]>; // bf16 -> pred def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), @@ -2228,18 +2201,6 @@ multiclass FSET_FORMAT { def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, Requires<[hasBF16Math]>; - def : Pat<(i1 (OpNode bf16:$a, fpimm:$b)), - (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, - Requires<[hasBF16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode bf16:$a, fpimm:$b)), - (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, - Requires<[hasBF16Math]>; - def : Pat<(i1 (OpNode fpimm:$a, bf16:$b)), - (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, - Requires<[hasBF16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode fpimm:$a, bf16:$b)), - (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, - Requires<[hasBF16Math]>; // f32 -> pred def : Pat<(i1 (OpNode f32:$a, f32:$b)), @@ -2273,18 +2234,6 @@ multiclass FSET_FORMAT { def : Pat<(i32 (OpNode f16:$a, f16:$b)), (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, Requires<[useFP16Math]>; - def : Pat<(i32 (OpNode f16:$a, fpimm:$b)), - (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode f16:$a, fpimm:$b)), - (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i32 (OpNode fpimm:$a, f16:$b)), - (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, f16:$b)), - (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, - Requires<[useFP16Math]>; // bf16 -> i32 def : Pat<(i32 (OpNode bf16:$a, bf16:$b)), @@ -2293,18 +2242,6 @@ multiclass FSET_FORMAT { def : Pat<(i32 (OpNode bf16:$a, bf16:$b)), (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, Requires<[hasBF16Math]>; - def : Pat<(i32 (OpNode bf16:$a, fpimm:$b)), - (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, - Requires<[hasBF16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode bf16:$a, fpimm:$b)), - (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, - Requires<[hasBF16Math]>; - def : Pat<(i32 (OpNode fpimm:$a, bf16:$b)), - (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, - Requires<[hasBF16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, bf16:$b)), - (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, - Requires<[hasBF16Math]>; // f32 -> i32 def : Pat<(i32 (OpNode f32:$a, f32:$b)), diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index 05f466f2138ec..b76b3e59e9e6d 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -94,7 +94,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4; ; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3; ; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24; -; CHECKPTX62-NEXT: shl.b32 %r40, %r26, %r11; +; CHECKPTX62-NEXT: mov.b32 %r39, 65535; +; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; ; CHECKPTX62-NEXT: not.b32 %r12, %r40; ; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 @@ -114,7 +115,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4; ; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3; ; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24; -; CHECKPTX62-NEXT: shl.b32 %r48, %r26, %r17; +; CHECKPTX62-NEXT: mov.b32 %r47, 65535; +; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17; ; CHECKPTX62-NEXT: not.b32 %r18, %r48; ; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index 32eb998e7d1ff..e0d0197c6ead5 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -16,8 +16,6 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: ld.param.u32 %r3, [wombat_param_1]; ; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0]; ; CHECK-NEXT: mov.b32 %r10, 0; -; CHECK-NEXT: mov.b64 %rd1, 0; -; CHECK-NEXT: mov.b32 %r6, 1; ; CHECK-NEXT: $L__BB0_1: // %bb3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { // callseq 0, 0 @@ -29,16 +27,17 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.f64 %fd1, [retval0]; +; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; ; CHECK-NEXT: or.b32 %r8, %r4, %r7; ; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; ; CHECK-NEXT: cvt.rn.f64.s32 %fd3, %r9; -; CHECK-NEXT: ld.param.f64 %fd1, [retval0]; -; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10; ; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3; +; CHECK-NEXT: mov.b64 %rd1, 0; ; CHECK-NEXT: st.global.f64 [%rd1], %fd5; -; CHECK-NEXT: mov.u32 %r10, %r6; +; CHECK-NEXT: mov.b32 %r10, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: br label %bb3