Skip to content

Commit a568efb

Browse files
committed
[NVPTX] Use sink registers instead of temp registers where possible.
PTX 7.1 introduces the concept of a "sink" register, `_`: a destination to which writes are ignored. When PTX 7.1 is available, this patch uses the sink register instead of an explicit temp register wherever one half of a split move is unused; without PTX 7.1 the existing temp-register form is still emitted. This results in cleaner assembly, and also works around a problem we encountered in some private workloads. (Unfortunately the tablegen is not particularly clean. But then again, it's tablegen...)
1 parent b5cc222 commit a568efb

File tree

2 files changed

+59
-13
lines changed

2 files changed

+59
-13
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3463,10 +3463,36 @@ let hasSideEffects = false in {
34633463
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
34643464
[]>;
34653465

3466+
// PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
3467+
// unused high/low part.
3468+
def I32toI16H_ptx71 : NVPTXInst<(outs Int16Regs:$high),
3469+
(ins Int32Regs:$s),
3470+
"mov.b32 \t{{_, $high}}, $s;",
3471+
[]>, Requires<[hasPTX<71>]>;
3472+
def I32toI16L_ptx71 : NVPTXInst<(outs Int16Regs:$low),
3473+
(ins Int32Regs:$s),
3474+
"mov.b32 \t{{$low, _}}, $s;",
3475+
[]>, Requires<[hasPTX<71>]>;
3476+
def I64toI32H_ptx71 : NVPTXInst<(outs Int32Regs:$high),
3477+
(ins Int64Regs:$s),
3478+
"mov.b64 \t{{_, $high}}, $s;",
3479+
[]>, Requires<[hasPTX<71>]>;
3480+
def I64toI32L_ptx71 : NVPTXInst<(outs Int32Regs:$low),
3481+
(ins Int64Regs:$s),
3482+
"mov.b64 \t{{$low, _}}, $s;",
3483+
[]>, Requires<[hasPTX<71>]>;
34663484
}
34673485

3468-
// Using partial vectorized move produces better SASS code for extraction of
3469-
// upper/lower parts of an integer.
3486+
def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
3487+
(I32toI16H_ptx71 Int32Regs:$s)>, Requires<[hasPTX<71>]>;
3488+
def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
3489+
(I32toI16H_ptx71 Int32Regs:$s)>, Requires<[hasPTX<71>]>;
3490+
def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
3491+
(I64toI32H_ptx71 Int64Regs:$s)>, Requires<[hasPTX<71>]>;
3492+
def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
3493+
(I64toI32H_ptx71 Int64Regs:$s)>, Requires<[hasPTX<71>]>;
3494+
3495+
// Fall back to the instructions that declare a scratch `tmp` register if we
// don't have PTX 7.1.
34703496
def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
34713497
(I32toI16H Int32Regs:$s)>;
34723498
def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
@@ -3480,10 +3506,15 @@ def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
34803506
(CVT_INREG_s32_s16 Int32Regs:$src)>;
34813507

34823508
foreach vt = [v2f16, v2bf16, v2i16] in {
3483-
def : Pat<(extractelt (vt Int32Regs:$src), 0),
3484-
(I32toI16L Int32Regs:$src)>;
3485-
def : Pat<(extractelt (vt Int32Regs:$src), 1),
3486-
(I32toI16H Int32Regs:$src)>;
3509+
def : Pat<(extractelt (vt Int32Regs:$src), 0),
3510+
(I32toI16L_ptx71 Int32Regs:$src)>, Requires<[hasPTX<71>]>;
3511+
def : Pat<(extractelt (vt Int32Regs:$src), 1),
3512+
(I32toI16H_ptx71 Int32Regs:$src)>, Requires<[hasPTX<71>]>;
3513+
3514+
def : Pat<(extractelt (vt Int32Regs:$src), 0),
3515+
(I32toI16L Int32Regs:$src)>;
3516+
def : Pat<(extractelt (vt Int32Regs:$src), 1),
3517+
(I32toI16H Int32Regs:$src)>;
34873518
}
34883519
def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
34893520
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
@@ -3866,6 +3897,14 @@ def : Pat <
38663897
(v2i16 (bswap v2i16:$a)),
38673898
(INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
38683899

3900+
def : Pat <
3901+
(i64 (bswap i64:$a)),
3902+
(V2I32toI64
3903+
(INT_NVVM_PRMT (I64toI32H_ptx71 Int64Regs:$a), (i32 0), (i32 0x0123)),
3904+
(INT_NVVM_PRMT (I64toI32L_ptx71 Int64Regs:$a), (i32 0), (i32 0x0123)))>,
3905+
Requires<[hasPTX<71>]>;
3906+
3907+
// Fall back to the split moves that declare a scratch `tmp` register if we
// don't have PTX 7.1.
38693908
def : Pat <
38703909
(i64 (bswap i64:$a)),
38713910
(V2I32toI64

llvm/test/CodeGen/NVPTX/bswap.ll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2-
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
3-
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
3+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
4+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
5+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
46

57
target triple = "nvptx64-nvidia-cuda"
68

@@ -60,11 +62,16 @@ define i64 @bswap64(i64 %a) {
6062
; CHECK-EMPTY:
6163
; CHECK-NEXT: // %bb.0:
6264
; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0];
63-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
64-
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
65-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
66-
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 291;
67-
; CHECK-NEXT: mov.b64 %rd2, {%r4, %r2};
65+
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
66+
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 291;
67+
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
68+
; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 291;
69+
; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
70+
; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
71+
; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 291;
72+
; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
73+
; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 291;
74+
; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
6875
; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd2;
6976
; CHECK-NEXT: ret;
7077
%b = tail call i64 @llvm.bswap.i64(i64 %a)

0 commit comments

Comments
 (0)