Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 41 additions & 14 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2998,27 +2998,46 @@ let hasSideEffects = false in {
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
[]>;

// PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
// unused high/low part.
// Unpacks the 32-bit register $s with a two-slot mov.b32: the first slot is
// discarded through the "_" sink and the second slot is written to the 16-bit
// output $high. No selection pattern is attached to the record itself ([]);
// it is only available when hasPTX<71> holds.
def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high),
(ins Int32Regs:$s),
"mov.b32 \t{{_, $high}}, $s;",
[]>, Requires<[hasPTX<71>]>;
// Unpacks the 32-bit register $s with a two-slot mov.b32: the first slot is
// written to the 16-bit output $low and the second slot is discarded through
// the "_" sink. No selection pattern is attached to the record itself ([]);
// it is only available when hasPTX<71> holds.
def I32toI16L_Sink : NVPTXInst<(outs Int16Regs:$low),
(ins Int32Regs:$s),
"mov.b32 \t{{$low, _}}, $s;",
[]>, Requires<[hasPTX<71>]>;
// Unpacks the 64-bit register $s with a two-slot mov.b64: the first slot is
// discarded through the "_" sink and the second slot is written to the 32-bit
// output $high. No selection pattern is attached to the record itself ([]);
// it is only available when hasPTX<71> holds.
def I64toI32H_Sink : NVPTXInst<(outs Int32Regs:$high),
(ins Int64Regs:$s),
"mov.b64 \t{{_, $high}}, $s;",
[]>, Requires<[hasPTX<71>]>;
// Unpacks the 64-bit register $s with a two-slot mov.b64: the first slot is
// written to the 32-bit output $low and the second slot is discarded through
// the "_" sink. No selection pattern is attached to the record itself ([]);
// it is only available when hasPTX<71> holds.
def I64toI32L_Sink : NVPTXInst<(outs Int32Regs:$low),
(ins Int64Regs:$s),
"mov.b64 \t{{$low, _}}, $s;",
[]>, Requires<[hasPTX<71>]>;
}

// Using partial vectorized move produces better SASS code for extraction of
// upper/lower parts of an integer.
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
(I32toI16H $s)>;
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
(I32toI16H $s)>;
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
(I64toI32H $s)>;
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
(I64toI32H $s)>;
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;

// Fall back to the old way if we don't have PTX 7.1.
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>;
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H $s)>;
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H $s)>;
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H $s)>;

def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
(CVT_INREG_s32_s16 $src)>;

foreach vt = [v2f16, v2bf16, v2i16] in {
def : Pat<(extractelt vt:$src, 0),
(I32toI16L $src)>;
def : Pat<(extractelt vt:$src, 1),
(I32toI16H $src)>;
def : Pat<(extractelt vt:$src, 0), (I32toI16L_Sink $src)>, Requires<[hasPTX<71>]>;
def : Pat<(extractelt vt:$src, 1), (I32toI16H_Sink $src)>, Requires<[hasPTX<71>]>;

def : Pat<(extractelt vt:$src, 0), (I32toI16L $src)>;
def : Pat<(extractelt vt:$src, 1), (I32toI16H $src)>;
}
def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
(V2I16toI32 $a, $b)>;
Expand Down Expand Up @@ -3405,6 +3424,14 @@ def : Pat <
(v2i16 (bswap v2i16:$a)),
(INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;

// i64 bswap when PTX 7.1 is available: each 32-bit half of $a is extracted
// with the sink-style moves (I64toI32H_Sink / I64toI32L_Sink), passed through
// INT_NVVM_PRMT with selector 0x0123, and the two results are repacked with
// V2I32toI64. Note the operand order: the permuted *high* half is the first
// V2I32toI64 operand and the permuted *low* half is the second, i.e. the
// halves trade places in the result.
def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
(INT_NVVM_PRMT (I64toI32H_Sink $a), (i32 0), (i32 0x0123)),
(INT_NVVM_PRMT (I64toI32L_Sink $a), (i32 0), (i32 0x0123)))>,
Requires<[hasPTX<71>]>;

// Fall back to the old way if we don't have PTX 7.1.
def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
Expand Down
29 changes: 16 additions & 13 deletions llvm/test/CodeGen/NVPTX/bf16-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -104,7 +104,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -628,7 +628,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -688,7 +688,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
; SM70-NEXT: or.b32 %r7, %r3, 4194304;
; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1012,7 +1012,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1071,7 +1071,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1133,7 +1133,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
; SM70-NEXT: setp.nan.f32 %p2, %f1, %f1;
; SM70-NEXT: or.b32 %r6, %r2, 4194304;
; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r7; }
; SM70-NEXT: mov.b32 {_, %rs3}, %r7;
; SM70-NEXT: st.param.b16 [func_retval0], %rs3;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1207,7 +1207,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1266,7 +1266,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r6, %r2, 4194304;
; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r7;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1329,7 +1329,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1393,7 +1393,7 @@ define bfloat @test_roundeven(bfloat %a) {
; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
; SM70-NEXT: or.b32 %r7, %r3, 4194304;
; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1528,7 +1528,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1655,7 +1655,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM90-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
; SM90-NEXT: ret;
%r = call <2 x bfloat> @llvm.maximum.bf16(<2 x bfloat> %a, <2 x bfloat> %b)
%r = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %r
}

Expand Down Expand Up @@ -1739,3 +1739,6 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
%r = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %r
}

declare bfloat @llvm.maximum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat>, <2 x bfloat>)
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,10 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
; CHECK-NEXT: ld.u64 %rd2, [%rd1];
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
; CHECK-NEXT: mov.b64 {_, %r1}, %rd2;
; CHECK-NEXT: ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
; CHECK-NEXT: st.u32 [%rd3], %rd2;
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
; CHECK-NEXT: mov.b32 {%rs1, _}, %r1;
; CHECK-NEXT: st.b16 [%rd3+4], %rs1;
; CHECK-NEXT: ret;
%t1 = load <3 x bfloat>, ptr %a
Expand Down
20 changes: 14 additions & 6 deletions llvm/test/CodeGen/NVPTX/bswap.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
; PTX ISA 7.1 first shipped with CUDA 11.1, so only hand the ptx71 output to
; ptxas when the installed ptxas is 11.1 or newer (11.0 rejects .version 7.1).
; RUN: %if ptxas-11.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

Expand Down Expand Up @@ -60,11 +63,16 @@ define i64 @bswap64(i64 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0];
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 291;
; CHECK-NEXT: mov.b64 %rd2, {%r4, %r2};
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 291;
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 291;
; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 291;
; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
Comment on lines +71 to +73
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not this patch's problem, but we're doing something silly here. Those two mov instructions should've been just one splitting move.

; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 291;
; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
%b = tail call i64 @llvm.bswap.i64(i64 %a)
Expand Down
9 changes: 6 additions & 3 deletions llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ define i16 @test_extract_0(<2 x i16> %a) #0 {
; COMMON-EMPTY:
; COMMON-NEXT: // %bb.0:
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_0_param_0];
; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
; I16x2-NEXT: mov.b32 {%rs1, _}, %r1;
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
; COMMON-NEXT: ret;
Expand All @@ -56,7 +57,8 @@ define i16 @test_extract_1(<2 x i16> %a) #0 {
; COMMON-EMPTY:
; COMMON-NEXT: // %bb.0:
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_1_param_0];
; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
; I16x2-NEXT: mov.b32 {_, %rs1}, %r1;
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
; COMMON-NEXT: ret;
Expand Down Expand Up @@ -1006,7 +1008,8 @@ define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
; COMMON-NEXT: // %bb.0:
; COMMON-NEXT: ld.param.u16 %rs1, [test_insertelement_param_1];
; COMMON-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
; I16x2-NEXT: mov.b32 {%rs2, _}, %r1;
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1};
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
; COMMON-NEXT: ret;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1412,7 +1412,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U;
; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10;
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r13; }
; CHECK-NEXT: mov.b32 {%rs18, _}, %r13;
; CHECK-NEXT: st.u8 [%rd3], %rs18;
; CHECK-NEXT: shr.u16 %rs19, %rs18, 8;
; CHECK-NEXT: st.u8 [%rd3+1], %rs19;
Expand Down