diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ebb5e32f5e6fc..7b135098bd4c1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1510,18 +1510,23 @@ let hasSideEffects = false in {
                    "mov.b64 \t$d, __local_depot$num;">;
 }
 
-
-// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
-let hasSideEffects = false, isAsCheapAsAMove = true in {
-  let isMoveReg = true in
+let hasSideEffects = false in {
+  let isMoveReg = true, isAsCheapAsAMove = true in
   class MOVr<RegisterClass RC, string OpStr> :
     BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>;
 
-  let isMoveImm = true in
+  let isMoveImm = true, isAsCheapAsAMove = true in
   class MOVi<RegTyInfo t, string suffix> :
     BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src),
                    "mov." # suffix,
                    [(set t.Ty:$dst, t.ImmNode:$src)]>;
+
+  // We don't want to set isAsCheapAsAMove to true for these instructions as
+  // this would prevent CSE and result in regressions (see discussion after
+  // PR-145581 in llvm-project).
+  class MovSymInst<RegTyInfo t, Operand Operand> :
+    BasicNVPTXInst<(outs t.RC:$dst), (ins Operand:$src),
+                   "mov.b" # t.Size>;
 }
 
 def MOV_B1_r : MOVr<B1, "pred">;
@@ -1539,6 +1544,9 @@ def MOV_BF16_i : MOVi<BF16RT, "bf16">;
 def MOV_F32_i : MOVi<F32RT, "f32">;
 def MOV_F64_i : MOVi<F64RT, "f64">;
 
+def MOV_B32_sym : MovSymInst<I32RT, i32imm>;
+def MOV_B64_sym : MovSymInst<I64RT, i64imm>;
+
 def to_tglobaladdr : SDNodeXForm<globaladdr, [{
   return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N),
                                         N->getValueType(0));
@@ -1555,11 +1563,11 @@ def to_tframeindex : SDNodeXForm<frameindex, [{
   return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0));
 }]>;
 
-def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>;
-def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>;
+def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>;
+def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>;
 
-def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>;
-def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>;
+def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>;
+def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>;
 
 //---- Copy Frame Index ----
 def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr),
diff --git a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
new file mode 100644
index 0000000000000..31ecce5a66b64
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+
+
+;; Confirm the mov.b64 of global_smem is CSE'd. We need to make things a bit
+;; complex with a loop to make this interesting.
+define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) {
+; CHECK-LABEL: test_mov_sym(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_mov_sym_param_2];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r4, [test_mov_sym_param_0];
+; CHECK-NEXT:    cvt.s64.s32 %rd1, %r4;
+; CHECK-NEXT:    mov.b64 %rd2, global_smem;
+; CHECK-NEXT:    add.s64 %rd3, %rd2, %rd1;
+; CHECK-NEXT:    ld.shared.b32 %r7, [%rd3];
+; CHECK-NEXT:    not.pred %p2, %p1;
+; CHECK-NEXT:    @%p2 bra $L__BB0_4;
+; CHECK-NEXT:  // %bb.1: // %if1.preheader
+; CHECK-NEXT:    ld.param.b32 %r5, [test_mov_sym_param_1];
+; CHECK-NEXT:    setp.ne.b32 %p3, %r4, %r5;
+; CHECK-NEXT:  $L__BB0_2: // %if1
+; CHECK-NEXT:  // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    @%p3 bra $L__BB0_2;
+; CHECK-NEXT:  // %bb.3: // %if2
+; CHECK-NEXT:    cvt.s64.s32 %rd4, %r5;
+; CHECK-NEXT:    add.s64 %rd6, %rd2, %rd4;
+; CHECK-NEXT:    ld.shared.b32 %r6, [%rd6];
+; CHECK-NEXT:    add.s32 %r7, %r7, %r6;
+; CHECK-NEXT:  $L__BB0_4: // %end
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
+entry:
+  %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1
+  %val = load i32, ptr addrspace(3) %gep
+  br i1 %cond, label %if1, label %end
+if1:
+  %cond2 = icmp eq i32 %offset1, %offset2
+  br i1 %cond2, label %if2, label %if1
+if2:
+  %gep2 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset2
+  %val2 = load i32, ptr addrspace(3) %gep2
+  %add = add i32 %val, %val2
+  br label %end
+end:
+  %ret = phi i32 [ %add, %if2 ], [ %val, %entry ]
+  ret i32 %ret
+}
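
Background on why the new MovSymInst class leaves isAsCheapAsAMove unset: MachineCSE generally declines to commonize an instruction it treats as "as cheap as a move" when the earlier identical def is not close by, preferring to rematerialize it rather than extend a live range. Below is a simplified, self-contained C++ sketch of that style of profitability heuristic; the struct, field, and function names are hypothetical and do not reproduce LLVM's actual MachineCSE implementation.

#include <iostream>
#include <string>

// Hypothetical model of an instruction, keeping only the fields the
// heuristic below looks at.
struct Inst {
  std::string Text;    // e.g. "mov.b64 %rd2, global_smem"
  bool AsCheapAsAMove; // mirrors the TableGen isAsCheapAsAMove bit
  int Block;           // index of the basic block containing the def
};

// Simplified profitability test: if the later instruction is flagged as
// cheap as a move and the earlier identical def lives in a different block,
// prefer to rematerialize rather than CSE.
static bool profitableToCSE(const Inst &Earlier, const Inst &Later) {
  if (Later.AsCheapAsAMove && Earlier.Block != Later.Block)
    return false;
  return true;
}

int main() {
  Inst Def{"mov.b64 %rd2, global_smem", true, /*Block=*/0};
  Inst Dup{"mov.b64 %rd5, global_smem", true, /*Block=*/3};

  // With the flag set (the old MOV_B64_i behavior), the cross-block
  // duplicate mov of the symbol is kept.
  std::cout << (profitableToCSE(Def, Dup) ? "CSE" : "keep duplicate") << "\n";

  // With the flag cleared (MovSymInst), the duplicate can be eliminated,
  // which is what cse-mov-sym.ll checks for.
  Def.AsCheapAsAMove = Dup.AsCheapAsAMove = false;
  std::cout << (profitableToCSE(Def, Dup) ? "CSE" : "keep duplicate") << "\n";
  return 0;
}

Under this model, clearing the flag on symbol moves is what allows the second mov.b64 of global_smem in the test above to fold into the first, so only one mov.b64 %rd2, global_smem appears in the CHECK lines.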