Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5918,6 +5918,9 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
const DebugLoc &DL = I.getDebugLoc();
Register CCReg = I.getOperand(0).getReg();

// Set SCC to true, in case the barrier instruction gets converted to a NOP.
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);

BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
.addImm(I.getOperand(2).getImm());

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5411,6 +5411,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return BB;
}
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
// Set SCC to true, in case the barrier instruction gets converted to a NOP.
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_CMP_EQ_U32))
.addImm(0)
.addImm(0);
Comment on lines +5415 to +5419
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this leave a dead instruction behind if it doesn't?

Copy link
Collaborator Author

@nhaehnle nhaehnle Jun 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps the comment isn't clear enough. The point is that hardware at runtime can convert the barrier instructions into NOPs (when the workgroup is launched with a single wave).

I suppose we could remove the barrier instruction ourselves if the workgroup size is statically known to be too small, but I would we hope we'd do that earlier in LLVM IR since it might expose more optimization opportunities.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it's sort of a hardware bug? The ordinary behavior is SCC is always set to 1? And if it nops it leaves it in the previous state?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly.

return BB;
}
case AMDGPU::GET_GROUPSTATICSIZE: {
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
"", []>{
let Defs = [SCC];
let Uses = [M0, SCC];
let SchedRW = [WriteBarrier];
let isConvergent = 1;
}
Expand All @@ -487,6 +488,8 @@ def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
(ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
let Defs = [SCC];
let Uses = [SCC];
let usesCustomInserter = 1;
let SchedRW = [WriteBarrier];
let isConvergent = 1;
}
Expand Down
10 changes: 7 additions & 3 deletions llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,8 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: V_NOP_e32 implicit $exec
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
Expand All @@ -385,7 +386,8 @@ body: |
bb.1:
successors: %bb.2
V_NOP_e32 implicit $exec
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
S_CMP_EQ_U32 0, 0, implicit-def $scc
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc

bb.2:
S_ENDPGM 0
Expand Down Expand Up @@ -437,6 +439,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: V_NOP_e32 implicit $exec
; CHECK-NEXT: $m0 = S_MOV_B32 -1
; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
Expand All @@ -449,7 +452,8 @@ body: |
successors: %bb.2
V_NOP_e32 implicit $exec
$m0 = S_MOV_B32 -1
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
S_CMP_EQ_U32 0, 0, implicit-def $scc
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit $scc

bb.2:
S_ENDPGM 0
Expand Down
41 changes: 41 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s

define i1 @func1() {
; GFX12-SDAG-LABEL: func1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
ret i1 %r
}

declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)
Loading