Skip to content

Commit bc77363

Browse files
authored
[NVPTX] Do not mark move of global address as cheap enabling more CSE (#153730)
1 parent fd3f052 commit bc77363

File tree

2 files changed

+76
-9
lines changed

2 files changed

+76
-9
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1510,18 +1510,23 @@ let hasSideEffects = false in {
15101510
"mov.b64 \t$d, __local_depot$num;">;
15111511
}
15121512

1513-
1514-
// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
1515-
let hasSideEffects = false, isAsCheapAsAMove = true in {
1516-
let isMoveReg = true in
1513+
let hasSideEffects = false in {
1514+
let isMoveReg = true, isAsCheapAsAMove = true in
15171515
class MOVr<RegisterClass RC, string OpStr> :
15181516
BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>;
15191517

1520-
let isMoveImm = true in
1518+
let isMoveImm = true, isAsCheapAsAMove = true in
15211519
class MOVi<RegTyInfo t, string suffix> :
15221520
BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src),
15231521
"mov." # suffix,
15241522
[(set t.Ty:$dst, t.ImmNode:$src)]>;
1523+
1524+
// We don't want to set isAsCheapAsAMove to true for these instructions as
1525+
// this would prevent CSE and result in regressions (see discussion after
1526+
// PR-145581 in llvm-project).
1527+
class MovSymInst<RegTyInfo t> :
1528+
BasicNVPTXInst<(outs t.RC:$dst), (ins Operand<t.Ty>:$src),
1529+
"mov.b" # t.Size>;
15251530
}
15261531

15271532
def MOV_B1_r : MOVr<B1, "pred">;
@@ -1539,6 +1544,9 @@ def MOV_BF16_i : MOVi<BF16RT, "b16">;
15391544
def MOV_F32_i : MOVi<F32RT, "b32">;
15401545
def MOV_F64_i : MOVi<F64RT, "b64">;
15411546

1547+
def MOV_B32_sym : MovSymInst<I32RT>;
1548+
def MOV_B64_sym : MovSymInst<I64RT>;
1549+
15421550

15431551
def to_tglobaladdr : SDNodeXForm<globaladdr, [{
15441552
return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N),
@@ -1555,11 +1563,11 @@ def to_tframeindex : SDNodeXForm<frameindex, [{
15551563
return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0));
15561564
}]>;
15571565

1558-
def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>;
1559-
def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>;
1566+
def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>;
1567+
def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>;
15601568

1561-
def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>;
1562-
def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>;
1569+
def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>;
1570+
def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>;
15631571

15641572
//---- Copy Frame Index ----
15651573
def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr),
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s | FileCheck %s
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
@global_smem = external addrspace(3) global [0 x i8], align 16
7+
8+
9+
;; Confirm the mov.b64 of global_smem is CSE'd. We need to make things a bit
10+
;; complex with a loop to make this interesting.
11+
define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) {
12+
; CHECK-LABEL: test_mov_sym(
13+
; CHECK: {
14+
; CHECK-NEXT: .reg .pred %p<4>;
15+
; CHECK-NEXT: .reg .b16 %rs<3>;
16+
; CHECK-NEXT: .reg .b32 %r<8>;
17+
; CHECK-NEXT: .reg .b64 %rd<7>;
18+
; CHECK-EMPTY:
19+
; CHECK-NEXT: // %bb.0: // %entry
20+
; CHECK-NEXT: ld.param.b8 %rs1, [test_mov_sym_param_2];
21+
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
22+
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
23+
; CHECK-NEXT: ld.param.b32 %r4, [test_mov_sym_param_0];
24+
; CHECK-NEXT: cvt.s64.s32 %rd1, %r4;
25+
; CHECK-NEXT: mov.b64 %rd2, global_smem;
26+
; CHECK-NEXT: add.s64 %rd3, %rd2, %rd1;
27+
; CHECK-NEXT: ld.shared.b32 %r7, [%rd3];
28+
; CHECK-NEXT: not.pred %p2, %p1;
29+
; CHECK-NEXT: @%p2 bra $L__BB0_4;
30+
; CHECK-NEXT: // %bb.1: // %if1.preheader
31+
; CHECK-NEXT: ld.param.b32 %r5, [test_mov_sym_param_1];
32+
; CHECK-NEXT: setp.ne.b32 %p3, %r4, %r5;
33+
; CHECK-NEXT: $L__BB0_2: // %if1
34+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
35+
; CHECK-NEXT: @%p3 bra $L__BB0_2;
36+
; CHECK-NEXT: // %bb.3: // %if2
37+
; CHECK-NEXT: cvt.s64.s32 %rd4, %r5;
38+
; CHECK-NEXT: add.s64 %rd6, %rd2, %rd4;
39+
; CHECK-NEXT: ld.shared.b32 %r6, [%rd6];
40+
; CHECK-NEXT: add.s32 %r7, %r7, %r6;
41+
; CHECK-NEXT: $L__BB0_4: // %end
42+
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
43+
; CHECK-NEXT: ret;
44+
entry:
45+
%gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1
46+
%val = load i32, ptr addrspace(3) %gep
47+
br i1 %cond, label %if1, label %end
48+
if1:
49+
%cond2 = icmp eq i32 %offset1, %offset2
50+
br i1 %cond2, label %if2, label %if1
51+
if2:
52+
%gep2 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset2
53+
%val2 = load i32, ptr addrspace(3) %gep2
54+
%add = add i32 %val, %val2
55+
br label %end
56+
end:
57+
%ret = phi i32 [ %add, %if2 ], [ %val, %entry ]
58+
ret i32 %ret
59+
}

0 commit comments

Comments
 (0)