Skip to content

Commit 8e5f6dd

Browse files
authored
[AMDGPU] Remove redundant s_cmp_lg_* sX, 0 (#162352)
Remove a redundant s_cmp_lg_* sX, 0 when the SALU instruction that defines sX already sets SCC to (sX != 0). --------- Signed-off-by: John Lu <[email protected]>
1 parent a5d3522 commit 8e5f6dd

36 files changed

+2406
-3135
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10626,6 +10626,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1062610626
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
1062710627
return false;
1062810628

10629+
const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10630+
this]() -> bool {
10631+
if (CmpValue != 0)
10632+
return false;
10633+
10634+
MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10635+
if (!Def || Def->getParent() != CmpInstr.getParent())
10636+
return false;
10637+
10638+
bool CanOptimize = false;
10639+
10640+
// For S_OP that set SCC = DST!=0, do the transformation
10641+
//
10642+
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10643+
if (setsSCCifResultIsNonZero(*Def))
10644+
CanOptimize = true;
10645+
10646+
// s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
10647+
// the same value that will be calculated by s_cmp_lg_*
10648+
//
10649+
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10650+
// imm), 0)
10651+
if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10652+
Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
10653+
bool Op1IsNonZeroImm =
10654+
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
10655+
bool Op2IsZeroImm =
10656+
Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
10657+
if (Op1IsNonZeroImm && Op2IsZeroImm)
10658+
CanOptimize = true;
10659+
}
10660+
10661+
if (!CanOptimize)
10662+
return false;
10663+
10664+
MachineInstr *KillsSCC = nullptr;
10665+
for (MachineInstr &MI :
10666+
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
10667+
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10668+
return false;
10669+
if (MI.killsRegister(AMDGPU::SCC, &RI))
10670+
KillsSCC = &MI;
10671+
}
10672+
10673+
if (MachineOperand *SccDef =
10674+
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10675+
SccDef->setIsDead(false);
10676+
if (KillsSCC)
10677+
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10678+
CmpInstr.eraseFromParent();
10679+
return true;
10680+
};
10681+
1062910682
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
1063010683
this](int64_t ExpectedValue, unsigned SrcSize,
1063110684
bool IsReversible, bool IsSigned) -> bool {
@@ -10700,16 +10753,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1070010753
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
1070110754
return false;
1070210755

10703-
for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10704-
I != E; ++I) {
10705-
if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10706-
I->killsRegister(AMDGPU::SCC, &RI))
10756+
MachineInstr *KillsSCC = nullptr;
10757+
for (MachineInstr &MI :
10758+
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
10759+
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
1070710760
return false;
10761+
if (MI.killsRegister(AMDGPU::SCC, &RI))
10762+
KillsSCC = &MI;
1070810763
}
1070910764

1071010765
MachineOperand *SccDef =
1071110766
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
1071210767
SccDef->setIsDead(false);
10768+
if (KillsSCC)
10769+
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
1071310770
CmpInstr.eraseFromParent();
1071410771

1071510772
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10753,15 +10810,15 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1075310810
case AMDGPU::S_CMP_LG_I32:
1075410811
case AMDGPU::S_CMPK_LG_U32:
1075510812
case AMDGPU::S_CMPK_LG_I32:
10756-
return optimizeCmpAnd(0, 32, true, false);
10813+
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
1075710814
case AMDGPU::S_CMP_GT_U32:
1075810815
case AMDGPU::S_CMPK_GT_U32:
1075910816
return optimizeCmpAnd(0, 32, false, false);
1076010817
case AMDGPU::S_CMP_GT_I32:
1076110818
case AMDGPU::S_CMPK_GT_I32:
1076210819
return optimizeCmpAnd(0, 32, false, true);
1076310820
case AMDGPU::S_CMP_LG_U64:
10764-
return optimizeCmpAnd(0, 64, true, false);
10821+
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
1076510822
}
1076610823

1076710824
return false;

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,30 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
709709
}
710710
}
711711

712+
static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
713+
if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
714+
return false;
715+
// Compares have no result
716+
if (MI.isCompare())
717+
return false;
718+
switch (MI.getOpcode()) {
719+
default:
720+
return true;
721+
case AMDGPU::S_ADD_I32:
722+
case AMDGPU::S_ADD_U32:
723+
case AMDGPU::S_ADDC_U32:
724+
case AMDGPU::S_SUB_I32:
725+
case AMDGPU::S_SUB_U32:
726+
case AMDGPU::S_SUBB_U32:
727+
case AMDGPU::S_MIN_I32:
728+
case AMDGPU::S_MIN_U32:
729+
case AMDGPU::S_MAX_I32:
730+
case AMDGPU::S_MAX_U32:
731+
case AMDGPU::S_ADDK_I32:
732+
return false;
733+
}
734+
}
735+
712736
static bool isEXP(const MachineInstr &MI) {
713737
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
714738
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
140140
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
141141
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
142142
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
143-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
144143
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
145144
; CHECK-NEXT: ; %bb.1: ; %false
146145
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
345344
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
346345
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
347346
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
348-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
349347
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
350348
; CHECK-NEXT: ; %bb.1: ; %false
351349
; CHECK-NEXT: s_mov_b32 s0, 33

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
143143
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
144144
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
145145
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
146-
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
147146
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
148147
; CHECK-NEXT: ; %bb.1: ; %false
149148
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
348347
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
349348
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
350349
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
351-
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
352350
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
353351
; CHECK-NEXT: ; %bb.1: ; %false
354352
; CHECK-NEXT: s_mov_b32 s0, 33

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
180180
; CHECK-LABEL: s_add64_32:
181181
; CHECK: ; %bb.0:
182182
; CHECK-NEXT: s_add_u32 s0, s0, s2
183-
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
184-
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
185183
; CHECK-NEXT: s_addc_u32 s1, s1, s3
186-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
187-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
188184
; CHECK-NEXT: s_addc_u32 s2, s4, 0
189185
; CHECK-NEXT: ; return to shader part epilog
190186
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
199195
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
200196
; CHECK-LABEL: s_uadd_v2i64:
201197
; CHECK: ; %bb.0:
202-
; CHECK-NEXT: s_add_u32 s10, s2, s6
203-
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
204-
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
205-
; CHECK-NEXT: s_addc_u32 s8, s3, s7
198+
; CHECK-NEXT: s_add_u32 s6, s2, s6
199+
; CHECK-NEXT: s_addc_u32 s7, s3, s7
206200
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
207201
; CHECK-NEXT: s_add_u32 s0, s0, s4
208-
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
209-
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
210202
; CHECK-NEXT: s_addc_u32 s1, s1, s5
211203
; CHECK-NEXT: v_mov_b32_e32 v2, s0
212204
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
215207
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
216208
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
217209
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
218-
; CHECK-NEXT: v_mov_b32_e32 v4, s10
219-
; CHECK-NEXT: v_mov_b32_e32 v5, s8
210+
; CHECK-NEXT: v_mov_b32_e32 v4, s6
211+
; CHECK-NEXT: v_mov_b32_e32 v5, s7
220212
; CHECK-NEXT: s_mov_b32 s1, s0
221213
; CHECK-NEXT: s_mov_b32 s3, s2
222214
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
233225
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
234226
; CHECK-LABEL: s_usub_v2i64:
235227
; CHECK: ; %bb.0:
236-
; CHECK-NEXT: s_sub_u32 s10, s2, s6
237-
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
238-
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
239-
; CHECK-NEXT: s_subb_u32 s8, s3, s7
228+
; CHECK-NEXT: s_sub_u32 s6, s2, s6
229+
; CHECK-NEXT: s_subb_u32 s7, s3, s7
240230
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
241231
; CHECK-NEXT: s_sub_u32 s0, s0, s4
242-
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
243-
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
244232
; CHECK-NEXT: s_subb_u32 s1, s1, s5
245233
; CHECK-NEXT: v_mov_b32_e32 v2, s0
246234
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
249237
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
250238
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
251239
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
252-
; CHECK-NEXT: v_mov_b32_e32 v4, s10
253-
; CHECK-NEXT: v_mov_b32_e32 v5, s8
240+
; CHECK-NEXT: v_mov_b32_e32 v4, s6
241+
; CHECK-NEXT: v_mov_b32_e32 v5, s7
254242
; CHECK-NEXT: s_mov_b32 s1, s0
255243
; CHECK-NEXT: s_mov_b32 s3, s2
256244
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
268256
; CHECK-LABEL: s_uadd_i64:
269257
; CHECK: ; %bb.0:
270258
; CHECK-NEXT: s_add_u32 s0, s0, s2
271-
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
272-
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
273259
; CHECK-NEXT: s_addc_u32 s1, s1, s3
274260
; CHECK-NEXT: v_mov_b32_e32 v2, s0
275261
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
292278
; CHECK-LABEL: s_uadd_p1:
293279
; CHECK: ; %bb.0:
294280
; CHECK-NEXT: s_add_u32 s0, s0, 1
295-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
296-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
297281
; CHECK-NEXT: s_addc_u32 s1, s1, 0
298282
; CHECK-NEXT: v_mov_b32_e32 v2, s0
299283
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
339323
; CHECK-LABEL: s_usub_p1:
340324
; CHECK: ; %bb.0:
341325
; CHECK-NEXT: s_sub_u32 s0, s0, 1
342-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
343-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
344326
; CHECK-NEXT: s_subb_u32 s1, s1, 0
345327
; CHECK-NEXT: v_mov_b32_e32 v2, s0
346328
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
363345
; CHECK-LABEL: s_usub_n1:
364346
; CHECK: ; %bb.0:
365347
; CHECK-NEXT: s_sub_u32 s0, s0, -1
366-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
367-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
368348
; CHECK-NEXT: s_subb_u32 s1, s1, -1
369349
; CHECK-NEXT: v_mov_b32_e32 v2, s0
370350
; CHECK-NEXT: v_mov_b32_e32 v3, s1

0 commit comments

Comments
 (0)