@@ -8009,7 +8009,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
 
   const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                                this](int64_t ExpectedValue,
-                                     unsigned SrcSize) -> bool {
+                                     unsigned SrcSize,
+                                     bool IsReversable) -> bool {
     // s_cmp_eq_u32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
     // s_cmp_eq_i32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
     // s_cmp_ge_u32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
@@ -8023,9 +8024,22 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     //
     // If result of the AND is unused except in the compare:
     // s_and_b(32|64) $src, 1 => s_bitcmp1_b(32|64) $src, 0
-
-    if (CmpValue != ExpectedValue)
-      return false;
+    //
+    // s_cmp_eq_u32 (s_and_b32 $src, 1), 0 => s_bitcmp0_b32 $src, 0
+    // s_cmp_eq_i32 (s_and_b32 $src, 1), 0 => s_bitcmp0_b32 $src, 0
+    // s_cmp_eq_u64 (s_and_b64 $src, 1), 0 => s_bitcmp0_b64 $src, 0
+    // s_cmp_lg_u32 (s_and_b32 $src, 1), 1 => s_bitcmp0_b32 $src, 0
+    // s_cmp_lg_i32 (s_and_b32 $src, 1), 1 => s_bitcmp0_b32 $src, 0
+    // s_cmp_lg_u64 (s_and_b64 $src, 1), 1 => s_bitcmp0_b64 $src, 0
+
+    bool IsReversedCC = false;
+    if (CmpValue != ExpectedValue) {
+      if (!IsReversable)
+        return false;
+      IsReversedCC = CmpValue == (ExpectedValue ^ 1);
+      if (!IsReversedCC)
+        return false;
+    }
 
     MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
     if (!Def || Def->getParent() != CmpInstr.getParent())
@@ -8041,6 +8055,10 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     else if (!Def->getOperand(2).isImm() || Def->getOperand(2).getImm() != 1)
       return false;
 
+    Register DefReg = Def->getOperand(0).getReg();
+    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
+      return false;
+
     for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
          I != E; ++I) {
       if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
@@ -8052,17 +8070,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     SccDef->setIsDead(false);
     CmpInstr.eraseFromParent();
 
-    if (!MRI->use_nodbg_empty(Def->getOperand(0).getReg()))
+    if (!MRI->use_nodbg_empty(DefReg)) {
+      assert(!IsReversedCC);
       return true;
+    }
 
     // Replace AND with unused result with a S_BITCMP.
     // TODO: If s_bitcmp can be used we are not limited to 1 and 0 but can
     // process any power of 2.
     MachineBasicBlock *MBB = Def->getParent();
 
-    // TODO: Reverse conditions can use S_BITCMP0_*.
-    unsigned NewOpc = (SrcSize == 32) ? AMDGPU::S_BITCMP1_B32
-                                      : AMDGPU::S_BITCMP1_B64;
+    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
+                                                     : AMDGPU::S_BITCMP1_B32
+                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64
+                                                     : AMDGPU::S_BITCMP1_B64;
 
     BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
         .add(*SrcOp)
@@ -8077,26 +8098,28 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     break;
   case AMDGPU::S_CMP_EQ_U32:
   case AMDGPU::S_CMP_EQ_I32:
-  case AMDGPU::S_CMP_GE_U32:
-  case AMDGPU::S_CMP_GE_I32:
   case AMDGPU::S_CMPK_EQ_U32:
   case AMDGPU::S_CMPK_EQ_I32:
+    return optimizeCmpAnd(1, 32, true);
+  case AMDGPU::S_CMP_GE_U32:
+  case AMDGPU::S_CMP_GE_I32:
   case AMDGPU::S_CMPK_GE_U32:
   case AMDGPU::S_CMPK_GE_I32:
-    return optimizeCmpAnd(1, 32);
+    return optimizeCmpAnd(1, 32, false);
   case AMDGPU::S_CMP_EQ_U64:
-    return optimizeCmpAnd(1, 64);
+    return optimizeCmpAnd(1, 64, true);
   case AMDGPU::S_CMP_LG_U32:
   case AMDGPU::S_CMP_LG_I32:
-  case AMDGPU::S_CMP_GT_U32:
-  case AMDGPU::S_CMP_GT_I32:
   case AMDGPU::S_CMPK_LG_U32:
   case AMDGPU::S_CMPK_LG_I32:
+    return optimizeCmpAnd(0, 32, true);
+  case AMDGPU::S_CMP_GT_U32:
+  case AMDGPU::S_CMP_GT_I32:
   case AMDGPU::S_CMPK_GT_U32:
   case AMDGPU::S_CMPK_GT_I32:
-    return optimizeCmpAnd(0, 32);
+    return optimizeCmpAnd(0, 32, false);
  case AMDGPU::S_CMP_LG_U64:
-    return optimizeCmpAnd(0, 64);
+    return optimizeCmpAnd(0, 64, true);
   }
 
   return false;
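
In short: optimizeCmpAnd previously folded an s_cmp_* of (src & 1) only when the compared constant matched ExpectedValue, emitting S_BITCMP1_*. With this change it also accepts the flipped constant, treating the compare as the same bit test with a reversed condition and emitting S_BITCMP0_* instead. The new hasOneNonDBGUse(DefReg) guard is needed because in the reversed case the S_AND cannot be kept as the SCC producer (its SCC has the opposite polarity), so it must be replaced outright, which is only legal when the compare is its sole non-debug user. Below is a minimal standalone sketch of why CmpValue == (ExpectedValue ^ 1) is exactly the reversed condition; this is not LLVM code, and sccForCmpEq is a hypothetical model of the SCC that s_cmp_eq_u32 on (src & 1) produces:

// Sketch only, not LLVM code. Models the EQ cases, where ExpectedValue is 1;
// the LG cases are the mirror image with ExpectedValue 0.
#include <cassert>
#include <cstdint>

// Hypothetical model of SCC after "s_cmp_eq_u32 (s_and_b32 src, 1), CmpValue".
static bool sccForCmpEq(uint32_t Src, uint32_t CmpValue) {
  return (Src & 1) == CmpValue;
}

int main() {
  const uint32_t ExpectedValue = 1; // the constant the EQ cases fold against
  for (uint32_t Src : {0u, 1u, 2u, 3u}) {
    bool Bit0Set = (Src & 1) != 0; // SCC of: s_bitcmp1_b32 src, 0
    bool Bit0Clear = !Bit0Set;     // SCC of: s_bitcmp0_b32 src, 0
    // Direct form (CmpValue == ExpectedValue): folds to S_BITCMP1_*.
    assert(sccForCmpEq(Src, ExpectedValue) == Bit0Set);
    // Reversed form (CmpValue == ExpectedValue ^ 1): folds to S_BITCMP0_*.
    assert(sccForCmpEq(Src, ExpectedValue ^ 1) == Bit0Clear);
  }
  return 0;
}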