Skip to content

Commit 337a4d5

Browse files
authored
[NVPTX] Use sink registers instead of temp registers where possible. (#134957)
PTX 7.1 introduces the concept of a "sink" register, `_`, which is a register to which writes are ignored. This patch makes us use sink registers where possible, instead of using explicit temp registers. This results in cleaner assembly, and also works around a problem we encountered in some private workloads. (Unfortunately the tablegen is not particularly clean. But then again, it's tablegen...)
1 parent 6ca9a30 commit 337a4d5

File tree

6 files changed

+80
-39
lines changed

6 files changed

+80
-39
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2998,27 +2998,46 @@ let hasSideEffects = false in {
29982998
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
29992999
[]>;
30003000

3001+
// PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
3002+
// unused high/low part.
3003+
def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high),
3004+
(ins Int32Regs:$s),
3005+
"mov.b32 \t{{_, $high}}, $s;",
3006+
[]>, Requires<[hasPTX<71>]>;
3007+
def I32toI16L_Sink : NVPTXInst<(outs Int16Regs:$low),
3008+
(ins Int32Regs:$s),
3009+
"mov.b32 \t{{$low, _}}, $s;",
3010+
[]>, Requires<[hasPTX<71>]>;
3011+
def I64toI32H_Sink : NVPTXInst<(outs Int32Regs:$high),
3012+
(ins Int64Regs:$s),
3013+
"mov.b64 \t{{_, $high}}, $s;",
3014+
[]>, Requires<[hasPTX<71>]>;
3015+
def I64toI32L_Sink : NVPTXInst<(outs Int32Regs:$low),
3016+
(ins Int64Regs:$s),
3017+
"mov.b64 \t{{$low, _}}, $s;",
3018+
[]>, Requires<[hasPTX<71>]>;
30013019
}
30023020

3003-
// Using partial vectorized move produces better SASS code for extraction of
3004-
// upper/lower parts of an integer.
3005-
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
3006-
(I32toI16H $s)>;
3007-
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
3008-
(I32toI16H $s)>;
3009-
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
3010-
(I64toI32H $s)>;
3011-
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
3012-
(I64toI32H $s)>;
3021+
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
3022+
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
3023+
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
3024+
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
3025+
3026+
// Without PTX 7.1 there is no sink register; fall back to the variants that
// declare a scoped temp register for the unused half.
3027+
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>;
3028+
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H $s)>;
3029+
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H $s)>;
3030+
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H $s)>;
30133031

30143032
def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
30153033
(CVT_INREG_s32_s16 $src)>;
30163034

30173035
foreach vt = [v2f16, v2bf16, v2i16] in {
3018-
def : Pat<(extractelt vt:$src, 0),
3019-
(I32toI16L $src)>;
3020-
def : Pat<(extractelt vt:$src, 1),
3021-
(I32toI16H $src)>;
3036+
def : Pat<(extractelt vt:$src, 0), (I32toI16L_Sink $src)>, Requires<[hasPTX<71>]>;
3037+
def : Pat<(extractelt vt:$src, 1), (I32toI16H_Sink $src)>, Requires<[hasPTX<71>]>;
3038+
3039+
def : Pat<(extractelt vt:$src, 0), (I32toI16L $src)>;
3040+
def : Pat<(extractelt vt:$src, 1), (I32toI16H $src)>;
30223041
}
30233042
def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
30243043
(V2I16toI32 $a, $b)>;
@@ -3405,6 +3424,14 @@ def : Pat <
34053424
(v2i16 (bswap v2i16:$a)),
34063425
(INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
34073426

3427+
def : Pat <
3428+
(i64 (bswap i64:$a)),
3429+
(V2I32toI64
3430+
(INT_NVVM_PRMT (I64toI32H_Sink $a), (i32 0), (i32 0x0123)),
3431+
(INT_NVVM_PRMT (I64toI32L_Sink $a), (i32 0), (i32 0x0123)))>,
3432+
Requires<[hasPTX<71>]>;
3433+
3434+
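// Without PTX 7.1 there is no sink register; fall back to splitting the i64
// with the variants that declare a scoped temp register for the unused half.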
34083435
def : Pat <
34093436
(i64 (bswap i64:$a)),
34103437
(V2I32toI64

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
3636
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
3737
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
3838
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
39-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
39+
; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
4040
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
4141
; SM70-NEXT: ret;
4242
;
@@ -104,7 +104,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
104104
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
105105
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
106106
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
107-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
107+
; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
108108
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
109109
; SM70-NEXT: ret;
110110
;
@@ -628,7 +628,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
628628
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
629629
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
630630
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
631-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
631+
; SM70-NEXT: mov.b32 {_, %rs1}, %r6;
632632
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
633633
; SM70-NEXT: ret;
634634
;
@@ -688,7 +688,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
688688
; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
689689
; SM70-NEXT: or.b32 %r7, %r3, 4194304;
690690
; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
691-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
691+
; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
692692
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
693693
; SM70-NEXT: ret;
694694
;
@@ -1012,7 +1012,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
10121012
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
10131013
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
10141014
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1015-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
1015+
; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
10161016
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
10171017
; SM70-NEXT: ret;
10181018
;
@@ -1071,7 +1071,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
10711071
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
10721072
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
10731073
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1074-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
1074+
; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
10751075
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
10761076
; SM70-NEXT: ret;
10771077
;
@@ -1133,7 +1133,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
11331133
; SM70-NEXT: setp.nan.f32 %p2, %f1, %f1;
11341134
; SM70-NEXT: or.b32 %r6, %r2, 4194304;
11351135
; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p2;
1136-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r7; }
1136+
; SM70-NEXT: mov.b32 {_, %rs3}, %r7;
11371137
; SM70-NEXT: st.param.b16 [func_retval0], %rs3;
11381138
; SM70-NEXT: ret;
11391139
;
@@ -1207,7 +1207,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
12071207
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
12081208
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
12091209
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1210-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
1210+
; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
12111211
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
12121212
; SM70-NEXT: ret;
12131213
;
@@ -1266,7 +1266,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
12661266
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
12671267
; SM70-NEXT: or.b32 %r6, %r2, 4194304;
12681268
; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
1269-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
1269+
; SM70-NEXT: mov.b32 {_, %rs1}, %r7;
12701270
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
12711271
; SM70-NEXT: ret;
12721272
;
@@ -1329,7 +1329,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
13291329
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
13301330
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
13311331
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1332-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
1332+
; SM70-NEXT: mov.b32 {_, %rs1}, %r6;
13331333
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
13341334
; SM70-NEXT: ret;
13351335
;
@@ -1393,7 +1393,7 @@ define bfloat @test_roundeven(bfloat %a) {
13931393
; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
13941394
; SM70-NEXT: or.b32 %r7, %r3, 4194304;
13951395
; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
1396-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
1396+
; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
13971397
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
13981398
; SM70-NEXT: ret;
13991399
;
@@ -1528,7 +1528,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
15281528
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
15291529
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
15301530
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
1531-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
1531+
; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
15321532
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
15331533
; SM70-NEXT: ret;
15341534
;
@@ -1655,7 +1655,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
16551655
; SM90-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
16561656
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
16571657
; SM90-NEXT: ret;
1658-
%r = call <2 x bfloat> @llvm.maximum.bf16(<2 x bfloat> %a, <2 x bfloat> %b)
1658+
%r = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
16591659
ret <2 x bfloat> %r
16601660
}
16611661

@@ -1739,3 +1739,6 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17391739
%r = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
17401740
ret <2 x bfloat> %r
17411741
}
1742+
1743+
declare bfloat @llvm.maximum.bf16(bfloat, bfloat)
1744+
declare <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat>, <2 x bfloat>)

llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,10 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) {
192192
; CHECK-NEXT: // %bb.0:
193193
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
194194
; CHECK-NEXT: ld.u64 %rd2, [%rd1];
195-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
195+
; CHECK-NEXT: mov.b64 {_, %r1}, %rd2;
196196
; CHECK-NEXT: ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
197197
; CHECK-NEXT: st.u32 [%rd3], %rd2;
198-
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
198+
; CHECK-NEXT: mov.b32 {%rs1, _}, %r1;
199199
; CHECK-NEXT: st.b16 [%rd3+4], %rs1;
200200
; CHECK-NEXT: ret;
201201
%t1 = load <3 x bfloat>, ptr %a

llvm/test/CodeGen/NVPTX/bswap.ll

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2-
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
33
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
4+
; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
5+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
6+
; PTX ISA 7.1 (and the `_` sink operand) is only accepted by ptxas from CUDA 11.1
; onward, so gate verification of the +ptx71 output on ptxas-11.1, not ptxas-11.0.
; RUN: %if ptxas-11.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
47

58
target triple = "nvptx64-nvidia-cuda"
69

@@ -60,11 +63,16 @@ define i64 @bswap64(i64 %a) {
6063
; CHECK-EMPTY:
6164
; CHECK-NEXT: // %bb.0:
6265
; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0];
63-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
64-
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
65-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
66-
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 291;
67-
; CHECK-NEXT: mov.b64 %rd2, {%r4, %r2};
66+
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
67+
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 291;
68+
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
69+
; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 291;
70+
; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
71+
; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
72+
; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 291;
73+
; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
74+
; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 291;
75+
; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
6876
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
6977
; CHECK-NEXT: ret;
7078
%b = tail call i64 @llvm.bswap.i64(i64 %a)

llvm/test/CodeGen/NVPTX/i16x2-instructions.ll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ define i16 @test_extract_0(<2 x i16> %a) #0 {
4040
; COMMON-EMPTY:
4141
; COMMON-NEXT: // %bb.0:
4242
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_0_param_0];
43-
; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
43+
; I16x2-NEXT: mov.b32 {%rs1, _}, %r1;
44+
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
4445
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
4546
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
4647
; COMMON-NEXT: ret;
@@ -56,7 +57,8 @@ define i16 @test_extract_1(<2 x i16> %a) #0 {
5657
; COMMON-EMPTY:
5758
; COMMON-NEXT: // %bb.0:
5859
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_1_param_0];
59-
; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
60+
; I16x2-NEXT: mov.b32 {_, %rs1}, %r1;
61+
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
6062
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
6163
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
6264
; COMMON-NEXT: ret;
@@ -1006,7 +1008,8 @@ define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
10061008
; COMMON-NEXT: // %bb.0:
10071009
; COMMON-NEXT: ld.param.u16 %rs1, [test_insertelement_param_1];
10081010
; COMMON-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
1009-
; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
1011+
; I16x2-NEXT: mov.b32 {%rs2, _}, %r1;
1012+
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
10101013
; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1};
10111014
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
10121015
; COMMON-NEXT: ret;

llvm/test/CodeGen/NVPTX/i8x4-instructions.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1412,7 +1412,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
14121412
; CHECK-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U;
14131413
; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U;
14141414
; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10;
1415-
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r13; }
1415+
; CHECK-NEXT: mov.b32 {%rs18, _}, %r13;
14161416
; CHECK-NEXT: st.u8 [%rd3], %rs18;
14171417
; CHECK-NEXT: shr.u16 %rs19, %rs18, 8;
14181418
; CHECK-NEXT: st.u8 [%rd3+1], %rs19;

0 commit comments

Comments
 (0)