-
Notifications
You must be signed in to change notification settings - Fork 15.4k
AMDGPU/GlobalISel: Fix inst-selection of ballot #109986
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1413,50 +1413,101 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { | |
| return true; | ||
| } | ||
|
|
||
| // Ballot has to zero bits in input lane-mask that are zero in current exec, | ||
| // Done as AND with exec. For inputs that are results of instruction that | ||
| // implicitly use same exec, for example compares in same basic block or SCC to | ||
| // VCC copy, use copy. | ||
| static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, | ||
| MachineBasicBlock *MBB) { | ||
| MachineInstr *MI = MRI.getVRegDef(Reg); | ||
| if (MI->getParent() != MBB) | ||
| return false; | ||
|
|
||
| // Lane mask generated by SCC to VCC copy. | ||
| if (MI->getOpcode() == AMDGPU::COPY) { | ||
| auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg()); | ||
| auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg()); | ||
| if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID && | ||
| SrcRB->getID() == AMDGPU::SGPRRegBankID) | ||
| return true; | ||
| } | ||
|
|
||
| // Lane mask generated using compare with same exec. | ||
| if (isa<GAnyCmp>(MI)) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is now missing a check that MI is in MBB. |
||
| return true; | ||
|
|
||
| Register LHS, RHS; | ||
| // Look through AND. | ||
| if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS)))) | ||
| return isLaneMaskFromSameBlock(LHS, MRI, MBB) || | ||
| isLaneMaskFromSameBlock(RHS, MRI, MBB); | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { | ||
| MachineBasicBlock *BB = I.getParent(); | ||
| const DebugLoc &DL = I.getDebugLoc(); | ||
| Register DstReg = I.getOperand(0).getReg(); | ||
| const unsigned Size = MRI->getType(DstReg).getSizeInBits(); | ||
| const bool Is64 = Size == 64; | ||
| const bool IsWave32 = (STI.getWavefrontSize() == 32); | ||
| Register SrcReg = I.getOperand(2).getReg(); | ||
| const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits(); | ||
| const unsigned WaveSize = STI.getWavefrontSize(); | ||
|
|
||
| // In the common case, the return type matches the wave size. | ||
| // However we also support emitting i64 ballots in wave32 mode. | ||
| if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) | ||
| if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32)) | ||
| return false; | ||
|
|
||
| std::optional<ValueAndVReg> Arg = | ||
| getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); | ||
| getIConstantVRegValWithLookThrough(SrcReg, *MRI); | ||
|
|
||
| Register Dst = DstReg; | ||
| // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot. | ||
| if (BallotSize != WaveSize) { | ||
| Dst = MRI->createVirtualRegister(TRI.getBoolRC()); | ||
| } | ||
|
|
||
| const auto BuildCopy = [&](Register SrcReg) { | ||
| if (Size == STI.getWavefrontSize()) { | ||
| BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) | ||
| .addReg(SrcReg); | ||
| return; | ||
| if (Arg) { | ||
| const int64_t Value = Arg->Value.getZExtValue(); | ||
| if (Value == 0) { | ||
| // Dst = S_MOV 0 | ||
| unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | ||
| BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0); | ||
| } else { | ||
| // Dst = COPY EXEC | ||
| assert(Value == 1); | ||
| BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec()); | ||
| } | ||
| if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) | ||
| return false; | ||
| } else { | ||
| if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) { | ||
| // Dst = COPY SrcReg | ||
| BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg); | ||
| if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) | ||
| return false; | ||
| } else { | ||
| // Dst = S_AND SrcReg, EXEC | ||
| unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; | ||
| auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst) | ||
| .addReg(SrcReg) | ||
| .addReg(TRI.getExec()) | ||
| .setOperandDead(3); // Dead scc | ||
| if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| // If emitting a i64 ballot in wave32, fill the upper bits with zeroes. | ||
| // i64 ballot on Wave32: zero-extend i32 ballot to i64. | ||
| if (BallotSize != WaveSize) { | ||
| Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); | ||
| BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); | ||
| BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) | ||
| .addReg(SrcReg) | ||
| .addReg(Dst) | ||
| .addImm(AMDGPU::sub0) | ||
| .addReg(HiReg) | ||
| .addImm(AMDGPU::sub1); | ||
| }; | ||
|
|
||
| if (Arg) { | ||
| const int64_t Value = Arg->Value.getSExtValue(); | ||
| if (Value == 0) { | ||
| unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | ||
| BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); | ||
| } else if (Value == -1) // all ones | ||
| BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); | ||
| else | ||
| return false; | ||
| } else | ||
| BuildCopy(I.getOperand(2).getReg()); | ||
| } | ||
|
|
||
| I.eraseFromParent(); | ||
| return true; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s | ||
|
|
||
| declare i32 @llvm.amdgcn.ballot.i32(i1) | ||
| declare i32 @llvm.ctpop.i32(i32) | ||
|
|
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) { | |
| ; CHECK-LABEL: non_compare: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 | ||
| ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 | ||
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 | ||
| ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %trunc = trunc i32 %x to i1 | ||
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) | ||
|
|
@@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { | |
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 | ||
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 | ||
| ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 | ||
| ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo | ||
|
Comment on lines
92
to
+93
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The output was a real compare, so this shouldn't be needed. The DAG doesn't emit this and
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is really not obvious, Input is "vgpr to vcc trunc" that is selected as compare. Will become better after new reg bank select. |
||
| ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 | ||
| ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 | ||
| ; CHECK-NEXT: ; %bb.1: ; %true | ||
| ; CHECK-NEXT: s_mov_b32 s0, 42 | ||
|
|
@@ -137,7 +139,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { | |
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 | ||
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 | ||
| ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 | ||
| ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo | ||
| ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 | ||
| ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 | ||
| ; CHECK-NEXT: ; %bb.1: ; %false | ||
| ; CHECK-NEXT: s_mov_b32 s0, 33 | ||
|
|
@@ -419,3 +422,80 @@ true: | |
| false: | ||
| ret i32 33 | ||
| } | ||
|
|
||
| ; Input that is not constant or direct result of a compare. | ||
| ; Tests setting 0 to inactive lanes. | ||
| define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { | ||
| ; GFX10-LABEL: non_cst_non_compare_input: | ||
| ; GFX10: ; %bb.0: ; %entry | ||
| ; GFX10-NEXT: s_and_b32 s0, 1, s0 | ||
| ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 | ||
| ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 | ||
| ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo | ||
| ; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 | ||
| ; GFX10-NEXT: ; %bb.1: ; %B | ||
| ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 | ||
| ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo | ||
| ; GFX10-NEXT: ; implicit-def: $vgpr2 | ||
| ; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo | ||
| ; GFX10-NEXT: s_or_b32 s0, s0, s2 | ||
| ; GFX10-NEXT: ; %bb.2: ; %Flow | ||
| ; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 | ||
| ; GFX10-NEXT: ; %bb.3: ; %A | ||
| ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 | ||
| ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo | ||
| ; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo | ||
| ; GFX10-NEXT: s_or_b32 s0, s0, s2 | ||
| ; GFX10-NEXT: ; %bb.4: ; %exit | ||
| ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 | ||
| ; GFX10-NEXT: s_and_b32 s0, s0, exec_lo | ||
| ; GFX10-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; GFX10-NEXT: global_store_dword v[0:1], v2, off | ||
| ; GFX10-NEXT: s_endpgm | ||
| ; | ||
| ; GFX11-LABEL: non_cst_non_compare_input: | ||
| ; GFX11: ; %bb.0: ; %entry | ||
| ; GFX11-NEXT: s_and_b32 s0, 1, s0 | ||
| ; GFX11-NEXT: s_mov_b32 s1, exec_lo | ||
| ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 | ||
| ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 | ||
| ; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 | ||
| ; GFX11-NEXT: ; %bb.1: ; %B | ||
| ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 | ||
| ; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo | ||
| ; GFX11-NEXT: ; implicit-def: $vgpr2 | ||
| ; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo | ||
| ; GFX11-NEXT: s_or_b32 s0, s0, s2 | ||
| ; GFX11-NEXT: ; %bb.2: ; %Flow | ||
| ; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 | ||
| ; GFX11-NEXT: ; %bb.3: ; %A | ||
| ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 | ||
| ; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo | ||
| ; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo | ||
| ; GFX11-NEXT: s_or_b32 s0, s0, s2 | ||
| ; GFX11-NEXT: ; %bb.4: ; %exit | ||
| ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 | ||
| ; GFX11-NEXT: s_and_b32 s0, s0, exec_lo | ||
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; GFX11-NEXT: global_store_b32 v[0:1], v2, off | ||
|
Comment on lines
+476
to
+480
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is ballot source that is not compare (source is phi), compared to what sdag does: |
||
| ; GFX11-NEXT: s_nop 0 | ||
| ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
| ; GFX11-NEXT: s_endpgm | ||
| entry: | ||
| %cmp = icmp eq i32 %cond, 0 | ||
| br i1 %cmp, label %A, label %B | ||
|
|
||
| A: | ||
| %val_A = icmp uge i32 %tid, 1 | ||
| br label %exit | ||
|
|
||
| B: | ||
| %val_B = icmp ult i32 %tid, 2 | ||
| br label %exit | ||
|
|
||
| exit: | ||
| %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] | ||
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) | ||
| store i32 %ballot, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same MBB check is here. Deleted the loop