Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1510,18 +1510,22 @@ let hasSideEffects = false in {
"mov.b64 \t$d, __local_depot$num;">;
}


// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
let hasSideEffects = false, isAsCheapAsAMove = true in {
let isMoveReg = true in
let hasSideEffects = false in {
let isMoveReg = true, isAsCheapAsAMove = true in
class MOVr<RegisterClass RC, string OpStr> :
BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>;

let isMoveImm = true in
let isMoveImm = true, isAsCheapAsAMove = true in
class MOVi<RegTyInfo t, string suffix> :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src),
"mov." # suffix,
[(set t.Ty:$dst, t.ImmNode:$src)]>;

// We don't want to set isAsCheapAsAMove to true for these instructions, as
// this would prevent CSE and result in regressions.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd mention the thread discussing the context of the issue.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added!

class MovSymInst<RegTyInfo t> :
BasicNVPTXInst<(outs t.RC:$dst), (ins Operand<t.Ty>:$src),
"mov.b" # t.Size>;
}

def MOV_B1_r : MOVr<B1, "pred">;
Expand All @@ -1539,6 +1543,9 @@ def MOV_BF16_i : MOVi<BF16RT, "b16">;
def MOV_F32_i : MOVi<F32RT, "b32">;
def MOV_F64_i : MOVi<F64RT, "b64">;

def MOV_B32_sym : MovSymInst<I32RT>;
def MOV_B64_sym : MovSymInst<I64RT>;


def to_tglobaladdr : SDNodeXForm<globaladdr, [{
return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N),
Expand All @@ -1555,11 +1562,11 @@ def to_tframeindex : SDNodeXForm<frameindex, [{
return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0));
}]>;

def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>;
def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>;
def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>;
def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>;

def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>;
def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>;
def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>;
def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>;

//---- Copy Frame Index ----
def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr),
Expand Down
59 changes: 59 additions & 0 deletions llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck %s

target triple = "nvptx64-nvidia-cuda"

@global_smem = external addrspace(3) global [0 x i8], align 16


;; Confirm the mov.b64 of global_smem is CSE'd. We need to make things a bit
;; complex with a loop to make this interesting.
define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) {
; CHECK-LABEL: test_mov_sym(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<4>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b8 %rs1, [test_mov_sym_param_2];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r4, [test_mov_sym_param_0];
; CHECK-NEXT: cvt.s64.s32 %rd1, %r4;
; CHECK-NEXT: mov.b64 %rd2, global_smem;
; CHECK-NEXT: add.s64 %rd3, %rd2, %rd1;
; CHECK-NEXT: ld.shared.b32 %r7, [%rd3];
; CHECK-NEXT: not.pred %p2, %p1;
; CHECK-NEXT: @%p2 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %if1.preheader
; CHECK-NEXT: ld.param.b32 %r5, [test_mov_sym_param_1];
; CHECK-NEXT: setp.ne.b32 %p3, %r4, %r5;
; CHECK-NEXT: $L__BB0_2: // %if1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: @%p3 bra $L__BB0_2;
; CHECK-NEXT: // %bb.3: // %if2
; CHECK-NEXT: cvt.s64.s32 %rd4, %r5;
; CHECK-NEXT: add.s64 %rd6, %rd2, %rd4;
; CHECK-NEXT: ld.shared.b32 %r6, [%rd6];
; CHECK-NEXT: add.s32 %r7, %r7, %r6;
; CHECK-NEXT: $L__BB0_4: // %end
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NEXT: ret;
;
; In the expected PTX above, the address of global_smem is materialized exactly
; once (the mov.b64 into %rd2 in the entry block), and %rd2 is reused for the
; second address computation in the if2 block instead of a second mov.b64.
entry:
; First use of @global_smem: load an i32 at byte offset %offset1.
  %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1
  %val = load i32, ptr addrspace(3) %gep
  br i1 %cond, label %if1, label %end
if1:
; Self-loop: branches back to if1 while %offset1 != %offset2. Both operands are
; function arguments, so the condition never changes between iterations; the
; loop only separates the two uses of @global_smem with non-trivial control
; flow.
  %cond2 = icmp eq i32 %offset1, %offset2
  br i1 %cond2, label %if2, label %if1
if2:
; Second use of @global_smem: load an i32 at byte offset %offset2 and add it to
; the value loaded in the entry block.
  %gep2 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset2
  %val2 = load i32, ptr addrspace(3) %gep2
  %add = add i32 %val, %val2
  br label %end
end:
; Returns the sum when the if2 path was taken, or the first load alone when
; %cond was false on entry.
  %ret = phi i32 [ %add, %if2 ], [ %val, %entry ]
  ret i32 %ret
}