Skip to content

Commit b94cbee

Browse files
jplehr authored and github-actions[bot] committed
Automerge: Revert "[AMDGPU] Remove redundant s_cmp_lg_* sX, 0 " (#164116)
Reverts llvm/llvm-project#162352.

Broke our buildbot: https://lab.llvm.org/buildbot/#/builders/10/builds/15674

To reproduce:

    cd llvm-project
    cmake -S llvm -B thebuild -C offload/cmake/caches/AMDGPULibcBot.cmake -GNinja
    cd thebuild
    ninja
    ninja check-libc-amdgcn-amd-amdhsa
2 parents 9b38ca0 + 023b1f6 commit b94cbee

36 files changed

+3135
-2406
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 6 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -10626,59 +10626,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1062610626
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
1062710627
return false;
1062810628

10629-
const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10630-
this]() -> bool {
10631-
if (CmpValue != 0)
10632-
return false;
10633-
10634-
MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10635-
if (!Def || Def->getParent() != CmpInstr.getParent())
10636-
return false;
10637-
10638-
bool CanOptimize = false;
10639-
10640-
// For S_OP that set SCC = DST!=0, do the transformation
10641-
//
10642-
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10643-
if (setsSCCifResultIsNonZero(*Def))
10644-
CanOptimize = true;
10645-
10646-
// s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
10647-
// the same value that will be calculated by s_cmp_lg_*
10648-
//
10649-
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10650-
// imm), 0)
10651-
if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10652-
Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
10653-
bool Op1IsNonZeroImm =
10654-
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
10655-
bool Op2IsZeroImm =
10656-
Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
10657-
if (Op1IsNonZeroImm && Op2IsZeroImm)
10658-
CanOptimize = true;
10659-
}
10660-
10661-
if (!CanOptimize)
10662-
return false;
10663-
10664-
MachineInstr *KillsSCC = nullptr;
10665-
for (MachineInstr &MI :
10666-
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
10667-
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10668-
return false;
10669-
if (MI.killsRegister(AMDGPU::SCC, &RI))
10670-
KillsSCC = &MI;
10671-
}
10672-
10673-
if (MachineOperand *SccDef =
10674-
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10675-
SccDef->setIsDead(false);
10676-
if (KillsSCC)
10677-
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10678-
CmpInstr.eraseFromParent();
10679-
return true;
10680-
};
10681-
1068210629
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
1068310630
this](int64_t ExpectedValue, unsigned SrcSize,
1068410631
bool IsReversible, bool IsSigned) -> bool {
@@ -10753,20 +10700,16 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1075310700
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
1075410701
return false;
1075510702

10756-
MachineInstr *KillsSCC = nullptr;
10757-
for (MachineInstr &MI :
10758-
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
10759-
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10703+
for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10704+
I != E; ++I) {
10705+
if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10706+
I->killsRegister(AMDGPU::SCC, &RI))
1076010707
return false;
10761-
if (MI.killsRegister(AMDGPU::SCC, &RI))
10762-
KillsSCC = &MI;
1076310708
}
1076410709

1076510710
MachineOperand *SccDef =
1076610711
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
1076710712
SccDef->setIsDead(false);
10768-
if (KillsSCC)
10769-
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
1077010713
CmpInstr.eraseFromParent();
1077110714

1077210715
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10810,15 +10753,15 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1081010753
case AMDGPU::S_CMP_LG_I32:
1081110754
case AMDGPU::S_CMPK_LG_U32:
1081210755
case AMDGPU::S_CMPK_LG_I32:
10813-
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
10756+
return optimizeCmpAnd(0, 32, true, false);
1081410757
case AMDGPU::S_CMP_GT_U32:
1081510758
case AMDGPU::S_CMPK_GT_U32:
1081610759
return optimizeCmpAnd(0, 32, false, false);
1081710760
case AMDGPU::S_CMP_GT_I32:
1081810761
case AMDGPU::S_CMPK_GT_I32:
1081910762
return optimizeCmpAnd(0, 32, false, true);
1082010763
case AMDGPU::S_CMP_LG_U64:
10821-
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
10764+
return optimizeCmpAnd(0, 64, true, false);
1082210765
}
1082310766

1082410767
return false;

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 0 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -709,30 +709,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
709709
}
710710
}
711711

712-
static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
713-
if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
714-
return false;
715-
// Compares have no result
716-
if (MI.isCompare())
717-
return false;
718-
switch (MI.getOpcode()) {
719-
default:
720-
return true;
721-
case AMDGPU::S_ADD_I32:
722-
case AMDGPU::S_ADD_U32:
723-
case AMDGPU::S_ADDC_U32:
724-
case AMDGPU::S_SUB_I32:
725-
case AMDGPU::S_SUB_U32:
726-
case AMDGPU::S_SUBB_U32:
727-
case AMDGPU::S_MIN_I32:
728-
case AMDGPU::S_MIN_U32:
729-
case AMDGPU::S_MAX_I32:
730-
case AMDGPU::S_MAX_U32:
731-
case AMDGPU::S_ADDK_I32:
732-
return false;
733-
}
734-
}
735-
736712
static bool isEXP(const MachineInstr &MI) {
737713
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
738714
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
140140
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
141141
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
142142
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
143+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
143144
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
144145
; CHECK-NEXT: ; %bb.1: ; %false
145146
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -344,6 +345,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
344345
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
345346
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
346347
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
348+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
347349
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
348350
; CHECK-NEXT: ; %bb.1: ; %false
349351
; CHECK-NEXT: s_mov_b32 s0, 33

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
143143
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
144144
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
145145
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
146+
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
146147
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
147148
; CHECK-NEXT: ; %bb.1: ; %false
148149
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -347,6 +348,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
347348
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
348349
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
349350
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
351+
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
350352
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
351353
; CHECK-NEXT: ; %bb.1: ; %false
352354
; CHECK-NEXT: s_mov_b32 s0, 33

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 28 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,11 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
180180
; CHECK-LABEL: s_add64_32:
181181
; CHECK: ; %bb.0:
182182
; CHECK-NEXT: s_add_u32 s0, s0, s2
183+
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
184+
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
183185
; CHECK-NEXT: s_addc_u32 s1, s1, s3
186+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
187+
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
184188
; CHECK-NEXT: s_addc_u32 s2, s4, 0
185189
; CHECK-NEXT: ; return to shader part epilog
186190
%sum64 = add i64 %val64A, %val64B
@@ -195,10 +199,14 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
195199
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
196200
; CHECK-LABEL: s_uadd_v2i64:
197201
; CHECK: ; %bb.0:
198-
; CHECK-NEXT: s_add_u32 s6, s2, s6
199-
; CHECK-NEXT: s_addc_u32 s7, s3, s7
202+
; CHECK-NEXT: s_add_u32 s10, s2, s6
203+
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
204+
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
205+
; CHECK-NEXT: s_addc_u32 s8, s3, s7
200206
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
201207
; CHECK-NEXT: s_add_u32 s0, s0, s4
208+
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
209+
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
202210
; CHECK-NEXT: s_addc_u32 s1, s1, s5
203211
; CHECK-NEXT: v_mov_b32_e32 v2, s0
204212
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -207,8 +215,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
207215
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
208216
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
209217
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
210-
; CHECK-NEXT: v_mov_b32_e32 v4, s6
211-
; CHECK-NEXT: v_mov_b32_e32 v5, s7
218+
; CHECK-NEXT: v_mov_b32_e32 v4, s10
219+
; CHECK-NEXT: v_mov_b32_e32 v5, s8
212220
; CHECK-NEXT: s_mov_b32 s1, s0
213221
; CHECK-NEXT: s_mov_b32 s3, s2
214222
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -225,10 +233,14 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
225233
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
226234
; CHECK-LABEL: s_usub_v2i64:
227235
; CHECK: ; %bb.0:
228-
; CHECK-NEXT: s_sub_u32 s6, s2, s6
229-
; CHECK-NEXT: s_subb_u32 s7, s3, s7
236+
; CHECK-NEXT: s_sub_u32 s10, s2, s6
237+
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
238+
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
239+
; CHECK-NEXT: s_subb_u32 s8, s3, s7
230240
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
231241
; CHECK-NEXT: s_sub_u32 s0, s0, s4
242+
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
243+
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
232244
; CHECK-NEXT: s_subb_u32 s1, s1, s5
233245
; CHECK-NEXT: v_mov_b32_e32 v2, s0
234246
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -237,8 +249,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
237249
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
238250
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
239251
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
240-
; CHECK-NEXT: v_mov_b32_e32 v4, s6
241-
; CHECK-NEXT: v_mov_b32_e32 v5, s7
252+
; CHECK-NEXT: v_mov_b32_e32 v4, s10
253+
; CHECK-NEXT: v_mov_b32_e32 v5, s8
242254
; CHECK-NEXT: s_mov_b32 s1, s0
243255
; CHECK-NEXT: s_mov_b32 s3, s2
244256
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -256,6 +268,8 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
256268
; CHECK-LABEL: s_uadd_i64:
257269
; CHECK: ; %bb.0:
258270
; CHECK-NEXT: s_add_u32 s0, s0, s2
271+
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
272+
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
259273
; CHECK-NEXT: s_addc_u32 s1, s1, s3
260274
; CHECK-NEXT: v_mov_b32_e32 v2, s0
261275
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -278,6 +292,8 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
278292
; CHECK-LABEL: s_uadd_p1:
279293
; CHECK: ; %bb.0:
280294
; CHECK-NEXT: s_add_u32 s0, s0, 1
295+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
296+
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
281297
; CHECK-NEXT: s_addc_u32 s1, s1, 0
282298
; CHECK-NEXT: v_mov_b32_e32 v2, s0
283299
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -323,6 +339,8 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
323339
; CHECK-LABEL: s_usub_p1:
324340
; CHECK: ; %bb.0:
325341
; CHECK-NEXT: s_sub_u32 s0, s0, 1
342+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
343+
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
326344
; CHECK-NEXT: s_subb_u32 s1, s1, 0
327345
; CHECK-NEXT: v_mov_b32_e32 v2, s0
328346
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -345,6 +363,8 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
345363
; CHECK-LABEL: s_usub_n1:
346364
; CHECK: ; %bb.0:
347365
; CHECK-NEXT: s_sub_u32 s0, s0, -1
366+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
367+
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
348368
; CHECK-NEXT: s_subb_u32 s1, s1, -1
349369
; CHECK-NEXT: v_mov_b32_e32 v2, s0
350370
; CHECK-NEXT: v_mov_b32_e32 v3, s1

0 commit comments

Comments
 (0)