Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
sign-extended from the width of the underlying PC hardware register even on
processors where the s_getpc_b64 instruction returns a zero-extended value.

llvm.amdgcn.ballot Returns a bitfield(i32 or i64) containing the result of its i1 argument
in all active lanes, and zero in all inactive lanes.
Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield
used by hardware to control active lanes when used in EXEC register.
For example, ballot(i1 true) return EXEC mask.

============================================== ==========================================================

.. TODO::
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp :
[IntrNoMem, IntrConvergent,
ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Returns a bitfield(i32 or i64) containing the result of its i1 argument
// in all active lanes, and zero in all inactive lanes.
def int_amdgcn_ballot :
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
Expand Down
101 changes: 76 additions & 25 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1413,50 +1413,101 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
return true;
}

// Ballot has to zero bits in input lane-mask that are zero in current exec,
// Done as AND with exec. For inputs that are results of instruction that
// implicitly use same exec, for example compares in same basic block or SCC to
// VCC copy, use copy.
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
MachineBasicBlock *MBB) {
MachineInstr *MI = MRI.getVRegDef(Reg);
if (MI->getParent() != MBB)
return false;
Comment on lines +1423 to +1424
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same MBB check is here. Deleted the loop


// Lane mask generated by SCC to VCC copy.
if (MI->getOpcode() == AMDGPU::COPY) {
auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
SrcRB->getID() == AMDGPU::SGPRRegBankID)
return true;
}

// Lane mask generated using compare with same exec.
if (isa<GAnyCmp>(MI))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now missing a check that MI is in MBB.

return true;

Register LHS, RHS;
// Look through AND.
if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
isLaneMaskFromSameBlock(RHS, MRI, MBB);

return false;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
Register DstReg = I.getOperand(0).getReg();
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
const bool Is64 = Size == 64;
const bool IsWave32 = (STI.getWavefrontSize() == 32);
Register SrcReg = I.getOperand(2).getReg();
const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
const unsigned WaveSize = STI.getWavefrontSize();

// In the common case, the return type matches the wave size.
// However we also support emitting i64 ballots in wave32 mode.
if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
return false;

std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
getIConstantVRegValWithLookThrough(SrcReg, *MRI);

Register Dst = DstReg;
// i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
if (BallotSize != WaveSize) {
Dst = MRI->createVirtualRegister(TRI.getBoolRC());
}

const auto BuildCopy = [&](Register SrcReg) {
if (Size == STI.getWavefrontSize()) {
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
.addReg(SrcReg);
return;
if (Arg) {
const int64_t Value = Arg->Value.getZExtValue();
if (Value == 0) {
// Dst = S_MOV 0
unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
} else {
// Dst = COPY EXEC
assert(Value == 1);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
}
if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
return false;
} else {
if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
// Dst = COPY SrcReg
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
return false;
} else {
// Dst = S_AND SrcReg, EXEC
unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
.addReg(SrcReg)
.addReg(TRI.getExec())
.setOperandDead(3); // Dead scc
if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
return false;
}
}

// If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
// i64 ballot on Wave32: zero-extend i32 ballot to i64.
if (BallotSize != WaveSize) {
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
.addReg(SrcReg)
.addReg(Dst)
.addImm(AMDGPU::sub0)
.addReg(HiReg)
.addImm(AMDGPU::sub1);
};

if (Arg) {
const int64_t Value = Arg->Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
} else if (Value == -1) // all ones
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
else
return false;
} else
BuildCopy(I.getOperand(2).getReg());
}

I.eraseFromParent();
return true;
Expand Down
90 changes: 85 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s

declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
Expand Down Expand Up @@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
Expand Down Expand Up @@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
Comment on lines 92 to +93
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The output was a real compare, so this shouldn't be needed. The DAG doesn't emit this and

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really not obvious, Input is "vgpr to vcc trunc" that is selected as compare. Will become better after new reg bank select.

; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
Expand Down Expand Up @@ -137,7 +139,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down Expand Up @@ -419,3 +422,80 @@ true:
false:
ret i32 33
}

; Input that is not constant or direct result of a compare.
; Tests setting 0 to inactive lanes.
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: non_cst_non_compare_input:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT: ; %bb.1: ; %B
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: ; implicit-def: $vgpr2
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT: ; %bb.3: ; %A
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: non_cst_non_compare_input:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_and_b32 s0, 1, s0
; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX11-NEXT: ; %bb.1: ; %B
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr2
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; %bb.2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
; GFX11-NEXT: ; %bb.3: ; %A
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; %bb.4: ; %exit
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
Comment on lines +476 to +480
Copy link
Collaborator Author

@petar-avramovic petar-avramovic Oct 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %A, label %B

A:
%val_A = icmp uge i32 %tid, 1
br label %exit

B:
%val_B = icmp ult i32 %tid, 2
br label %exit

exit:
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
store i32 %ballot, ptr addrspace(1) %out
ret void
}
58 changes: 55 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
Expand Down Expand Up @@ -92,7 +93,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
Expand Down Expand Up @@ -140,7 +142,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down Expand Up @@ -422,3 +425,52 @@ true:
false:
ret i32 33
}

; Input that is not constant or direct result of a compare.
; Tests setting 0 to inactive lanes.
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; CHECK-LABEL: non_cst_non_compare_input:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; CHECK-NEXT: ; %bb.1: ; %B
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; %bb.2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
; CHECK-NEXT: ; %bb.3: ; %A
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; CHECK-NEXT: ; %bb.4: ; %exit
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; CHECK-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %A, label %B

A:
%val_A = icmp uge i32 %tid, 1
br label %exit

B:
%val_B = icmp ult i32 %tid, 2
br label %exit

exit:
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
store i64 %ballot, ptr addrspace(1) %out
ret void
}
Loading