diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 174a497c51b26..069d5bfa9c7a6 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1402,6 +1402,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics. performs subtraction only if the memory value is greater than or equal to the data value. + llvm.amdgcn.s.barrier.signal.isfirst Provides access to the s_barrier_signal_isfirst instruction; + additionally ensures that the result value is valid even when the + intrinsic is used from a wave that is not running in a workgroup. + llvm.amdgcn.s.getpc Provides access to the s_getpc_b64 instruction, but with the return value sign-extended from the width of the underlying PC hardware register even on processors where the s_getpc_b64 instruction returns a zero-extended value. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 7e72f6ca478fd..672520390c8bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5918,6 +5918,9 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst( const DebugLoc &DL = I.getDebugLoc(); Register CCReg = I.getOperand(0).getReg(); + // Set SCC to true, in case the barrier instruction gets converted to a NOP. 
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0); + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)) .addImm(I.getOperand(2).getImm()); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 53dc540cbd635..341d0b98797f5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5411,6 +5411,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); return BB; } + case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: { + // Set SCC to true, in case the barrier instruction gets converted to a NOP. + BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_CMP_EQ_U32)) + .addImm(0) + .addImm(0); + return BB; + } case AMDGPU::GET_GROUPSTATICSIZE: { assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e0a36758534d5..90e65a6950c0a 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -472,6 +472,7 @@ def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins), def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins), "", []>{ let Defs = [SCC]; + let Uses = [M0, SCC]; let SchedRW = [WriteBarrier]; let isConvergent = 1; } @@ -487,6 +488,8 @@ def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs), (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{ let Defs = [SCC]; + let Uses = [SCC]; + let usesCustomInserter = 1; let SchedRW = [WriteBarrier]; let isConvergent = 1; } diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir 
b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir index e4b16a3fa0040..f437dee253d00 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir @@ -374,7 +374,8 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: V_NOP_e32 implicit $exec - ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: S_ENDPGM 0 @@ -385,7 +386,8 @@ body: | bb.1: successors: %bb.2 V_NOP_e32 implicit $exec - S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc + S_CMP_EQ_U32 0, 0, implicit-def $scc + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc bb.2: S_ENDPGM 0 @@ -437,6 +439,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: V_NOP_e32 implicit $exec ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -449,7 +452,8 @@ body: | successors: %bb.2 V_NOP_e32 implicit $exec $m0 = S_MOV_B32 -1 - S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc + S_CMP_EQ_U32 0, 0, implicit-def $scc + S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit $scc bb.2: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll new file mode 100644 index 0000000000000..651d204f65b6c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | 
FileCheck -check-prefixes=GFX12-GISEL %s + +define i1 @func1() { +; GFX12-SDAG-LABEL: func1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: func1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + ret i1 %r +} + +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)