Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 69 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10608,6 +10608,73 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;

const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
this]() -> bool {
if (CmpValue != 0)
return false;

MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
Comment on lines +10617 to +10618
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't see why the parent would matter, but this also looks untested

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Def and CmpInstr need to have the same parent so that the scan will work.


if (!(Def->getOpcode() == AMDGPU::S_LSHL_B32 ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this just a list of all instructions that define SCC?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be all instructions that set SCC = DST!=0. s_max/s_min are excluded since they set SCC but with a different condition. I need to review the list since I missed some instructions that should be included (e.g. s_absdiff).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a switch would be slightly more readable ?

Def->getOpcode() == AMDGPU::S_LSHL_B64 ||
Def->getOpcode() == AMDGPU::S_LSHR_B32 ||
Def->getOpcode() == AMDGPU::S_LSHR_B64 ||
Def->getOpcode() == AMDGPU::S_AND_B32 ||
Def->getOpcode() == AMDGPU::S_AND_B64 ||
Def->getOpcode() == AMDGPU::S_OR_B32 ||
Def->getOpcode() == AMDGPU::S_OR_B64 ||
Def->getOpcode() == AMDGPU::S_XOR_B32 ||
Def->getOpcode() == AMDGPU::S_XOR_B64 ||
Def->getOpcode() == AMDGPU::S_NAND_B32 ||
Def->getOpcode() == AMDGPU::S_NAND_B64 ||
Def->getOpcode() == AMDGPU::S_NOR_B32 ||
Def->getOpcode() == AMDGPU::S_NOR_B64 ||
Def->getOpcode() == AMDGPU::S_XNOR_B32 ||
Def->getOpcode() == AMDGPU::S_XNOR_B64 ||
Def->getOpcode() == AMDGPU::S_ANDN2_B32 ||
Def->getOpcode() == AMDGPU::S_ANDN2_B64 ||
Def->getOpcode() == AMDGPU::S_ORN2_B32 ||
Def->getOpcode() == AMDGPU::S_ORN2_B64 ||
Def->getOpcode() == AMDGPU::S_BFE_I32 ||
Def->getOpcode() == AMDGPU::S_BFE_I64 ||
Def->getOpcode() == AMDGPU::S_BFE_U32 ||
Def->getOpcode() == AMDGPU::S_BFE_U64 ||
Def->getOpcode() == AMDGPU::S_BCNT0_I32_B32 ||
Def->getOpcode() == AMDGPU::S_BCNT0_I32_B64 ||
Def->getOpcode() == AMDGPU::S_BCNT1_I32_B32 ||
Def->getOpcode() == AMDGPU::S_BCNT1_I32_B64 ||
Def->getOpcode() == AMDGPU::S_QUADMASK_B32 ||
Def->getOpcode() == AMDGPU::S_QUADMASK_B64 ||
Def->getOpcode() == AMDGPU::S_NOT_B32 ||
Def->getOpcode() == AMDGPU::S_NOT_B64 ||

((Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
Def->getOpcode() == AMDGPU::S_CSELECT_B64) &&
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() &&
!Def->getOperand(2).isImm() && !Def->getOperand(2).getImm())))
return false;

for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
I != E; ++I) {
if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
I->killsRegister(AMDGPU::SCC, &RI))
return false;
}
Comment on lines +10659 to +10664
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you should need to be scanning around for physreg liveness, I don't see other targets doing that here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you should need to be scanning around for physreg liveness, I don't see other targets doing that here

Scanning for register liveness is also done in the other lambda, optimizeCmpAnd, (line 10752) that is used in optimizeCompareInstr. It is necessary to prevent incorrectly deleting a s_cmp if an there is an intervening definition. In PR #161582, tests were added that requires this scanning. PR #161582 was done to add SCC as an implicit destination of s_quadmask*. Without this fix the scanning would not detect a conflict.


if (!(Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
Def->getOpcode() == AMDGPU::S_CSELECT_B64)) {
Comment on lines +10666 to +10667
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

demorgan this condition

MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
assert(SccDef && "Def instruction must define SCC");
SccDef->setIsDead(false);
}

CmpInstr.eraseFromParent();
return true;
};

const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
Expand Down Expand Up @@ -10735,15 +10802,15 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
return optimizeCmpAnd(0, 32, true, false);
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
case AMDGPU::S_CMP_GT_I32:
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
return optimizeCmpAnd(0, 64, true, false);
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
}

return false;
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down Expand Up @@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down Expand Up @@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down
Loading