-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Remove redundant s_cmp_lg_* sX, 0 #162352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10608,6 +10608,73 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, | |
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) | ||
return false; | ||
|
||
const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, | ||
this]() -> bool { | ||
if (CmpValue != 0) | ||
return false; | ||
|
||
MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); | ||
if (!Def || Def->getParent() != CmpInstr.getParent()) | ||
return false; | ||
|
||
if (!(Def->getOpcode() == AMDGPU::S_LSHL_B32 || | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this just a list of all instructions that define SCC? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It should be all instructions that set SCC = DST!=0. s_max/s_min are excluded since they set SCC but with a different condition. I need to review the list since I missed some instructions that should be included (e.g. s_absdiff). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe a switch would be slightly more readable ? |
||
Def->getOpcode() == AMDGPU::S_LSHL_B64 || | ||
Def->getOpcode() == AMDGPU::S_LSHR_B32 || | ||
Def->getOpcode() == AMDGPU::S_LSHR_B64 || | ||
Def->getOpcode() == AMDGPU::S_AND_B32 || | ||
Def->getOpcode() == AMDGPU::S_AND_B64 || | ||
Def->getOpcode() == AMDGPU::S_OR_B32 || | ||
Def->getOpcode() == AMDGPU::S_OR_B64 || | ||
Def->getOpcode() == AMDGPU::S_XOR_B32 || | ||
Def->getOpcode() == AMDGPU::S_XOR_B64 || | ||
Def->getOpcode() == AMDGPU::S_NAND_B32 || | ||
Def->getOpcode() == AMDGPU::S_NAND_B64 || | ||
Def->getOpcode() == AMDGPU::S_NOR_B32 || | ||
Def->getOpcode() == AMDGPU::S_NOR_B64 || | ||
Def->getOpcode() == AMDGPU::S_XNOR_B32 || | ||
Def->getOpcode() == AMDGPU::S_XNOR_B64 || | ||
Def->getOpcode() == AMDGPU::S_ANDN2_B32 || | ||
Def->getOpcode() == AMDGPU::S_ANDN2_B64 || | ||
Def->getOpcode() == AMDGPU::S_ORN2_B32 || | ||
Def->getOpcode() == AMDGPU::S_ORN2_B64 || | ||
Def->getOpcode() == AMDGPU::S_BFE_I32 || | ||
Def->getOpcode() == AMDGPU::S_BFE_I64 || | ||
Def->getOpcode() == AMDGPU::S_BFE_U32 || | ||
Def->getOpcode() == AMDGPU::S_BFE_U64 || | ||
Def->getOpcode() == AMDGPU::S_BCNT0_I32_B32 || | ||
Def->getOpcode() == AMDGPU::S_BCNT0_I32_B64 || | ||
Def->getOpcode() == AMDGPU::S_BCNT1_I32_B32 || | ||
Def->getOpcode() == AMDGPU::S_BCNT1_I32_B64 || | ||
Def->getOpcode() == AMDGPU::S_QUADMASK_B32 || | ||
Def->getOpcode() == AMDGPU::S_QUADMASK_B64 || | ||
Def->getOpcode() == AMDGPU::S_NOT_B32 || | ||
Def->getOpcode() == AMDGPU::S_NOT_B64 || | ||
|
||
((Def->getOpcode() == AMDGPU::S_CSELECT_B32 || | ||
Def->getOpcode() == AMDGPU::S_CSELECT_B64) && | ||
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() && | ||
!Def->getOperand(2).isImm() && !Def->getOperand(2).getImm()))) | ||
return false; | ||
|
||
for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); | ||
I != E; ++I) { | ||
if (I->modifiesRegister(AMDGPU::SCC, &RI) || | ||
I->killsRegister(AMDGPU::SCC, &RI)) | ||
return false; | ||
} | ||
Comment on lines
+10659
to
+10664
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you should need to be scanning around for physreg liveness, I don't see other targets doing that here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Scanning for register liveness is also done in the other lambda, optimizeCmpAnd, (line 10752) that is used in optimizeCompareInstr. It is necessary to prevent incorrectly deleting a s_cmp if an there is an intervening definition. In PR #161582, tests were added that requires this scanning. PR #161582 was done to add SCC as an implicit destination of s_quadmask*. Without this fix the scanning would not detect a conflict. |
||
|
||
if (!(Def->getOpcode() == AMDGPU::S_CSELECT_B32 || | ||
Def->getOpcode() == AMDGPU::S_CSELECT_B64)) { | ||
Comment on lines
+10666
to
+10667
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. demorgan this condition |
||
MachineOperand *SccDef = | ||
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); | ||
assert(SccDef && "Def instruction must define SCC"); | ||
SccDef->setIsDead(false); | ||
} | ||
|
||
CmpInstr.eraseFromParent(); | ||
return true; | ||
}; | ||
|
||
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, | ||
this](int64_t ExpectedValue, unsigned SrcSize, | ||
bool IsReversible, bool IsSigned) -> bool { | ||
|
@@ -10735,15 +10802,15 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, | |
case AMDGPU::S_CMP_LG_I32: | ||
case AMDGPU::S_CMPK_LG_U32: | ||
case AMDGPU::S_CMPK_LG_I32: | ||
return optimizeCmpAnd(0, 32, true, false); | ||
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(); | ||
case AMDGPU::S_CMP_GT_U32: | ||
case AMDGPU::S_CMPK_GT_U32: | ||
return optimizeCmpAnd(0, 32, false, false); | ||
case AMDGPU::S_CMP_GT_I32: | ||
case AMDGPU::S_CMPK_GT_I32: | ||
return optimizeCmpAnd(0, 32, false, true); | ||
case AMDGPU::S_CMP_LG_U64: | ||
return optimizeCmpAnd(0, 64, true, false); | ||
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(); | ||
} | ||
|
||
return false; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't see why the parent would matter, but this also looks untested
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Def and CmpInstr need to have the same parent so that the scan will work.