-
Couldn't load subscription status.
- Fork 15k
expand-fp: Refactor modification status handling (NFC) #163542
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This is a small refactoring that sets the return value of the runImpl function — which indicates whether or not the IR has been changed — in a single place, instead of setting it separately at each insertion of a supported instruction into the worklist.
|
This change was suggested in the review of PR #163153. |
The "dyn_cast" needs to stay, as witnessed by the test case in CodeGen/AMDGPU/frem.ll with two constant vector operands. Refactor the loop that visits the instructions to allow for a single assignment to the "Modified" variable.
This better reflects the structure of the instruction-visiting function and is more readable.
|
@llvm/pr-subscribers-backend-amdgpu Author: Frederik Harwath (frederik-h) ChangesThis is a small refactoring to set the return value of the runImpl function which indicates whether or not the IR has been changed in a single place instead of doing it separately at the insertion of supported instructions into the worklist. Patch is 64.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163542.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 04c700869cd69..2b5ced3915a2c 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -993,7 +993,6 @@ static void addToWorklist(Instruction &I,
static bool runImpl(Function &F, const TargetLowering &TLI,
AssumptionCache *AC) {
SmallVector<Instruction *, 4> Worklist;
- bool Modified = false;
unsigned MaxLegalFpConvertBitWidth =
TLI.getMaxLargeFPConvertBitWidthSupported();
@@ -1003,50 +1002,49 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS)
return false;
- for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
- Instruction &I = *It++;
+ auto ShouldHandleInst = [&](Instruction &I) {
Type *Ty = I.getType();
// TODO: This pass doesn't handle scalable vectors.
if (Ty->isScalableTy())
- continue;
+ return false;
switch (I.getOpcode()) {
case Instruction::FRem:
- if (!targetSupportsFrem(TLI, Ty) &&
- FRemExpander::canExpandType(Ty->getScalarType())) {
- addToWorklist(I, Worklist);
- Modified = true;
- }
- break;
+ return !targetSupportsFrem(TLI, Ty) &&
+ FRemExpander::canExpandType(Ty->getScalarType());
+
case Instruction::FPToUI:
case Instruction::FPToSI: {
auto *IntTy = cast<IntegerType>(Ty->getScalarType());
- if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
- continue;
-
- addToWorklist(I, Worklist);
- Modified = true;
- break;
+ return IntTy->getIntegerBitWidth() > MaxLegalFpConvertBitWidth;
}
+
case Instruction::UIToFP:
case Instruction::SIToFP: {
auto *IntTy =
cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
- if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
- continue;
-
- addToWorklist(I, Worklist);
- Modified = true;
- break;
+ return IntTy->getIntegerBitWidth() > MaxLegalFpConvertBitWidth;
}
- default:
- break;
}
+
+ return false;
+ };
+
+ bool Modified = false;
+ for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
+ Instruction &I = *It++;
+ if (!ShouldHandleInst(I))
+ continue;
+
+ addToWorklist(I, Worklist);
+ Modified = true;
}
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
- if (I->getOpcode() == Instruction::FRem) {
+
+ switch (I->getOpcode()) {
+ case Instruction::FRem: {
auto SQ = [&]() -> std::optional<SimplifyQuery> {
if (AC) {
auto Res = std::make_optional<SimplifyQuery>(
@@ -1058,11 +1056,18 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
}();
expandFRem(cast<BinaryOperator>(*I), SQ);
- } else if (I->getOpcode() == Instruction::FPToUI ||
- I->getOpcode() == Instruction::FPToSI) {
+ break;
+ }
+
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
expandFPToI(I);
- } else {
+ break;
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
expandIToFP(I);
+ break;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 415828f32f920..901ce6146cc9b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -17589,5 +17589,1363 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
+
+define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: frem_v2f64_const_zero_num:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_mov_b32_e32 v1, s8
+; SI-NEXT: v_mov_b32_e32 v3, s4
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const_zero_num:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, s2
+; CI-NEXT: s_mov_b32 s5, s3
+; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_and_b64 s[2:3], vcc, exec
+; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0
+; CI-NEXT: s_mov_b32 s2, s6
+; CI-NEXT: s_mov_b32 s3, s7
+; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: s_and_b64 s[4:5], vcc, exec
+; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; CI-NEXT: v_mov_b32_e32 v3, s4
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const_zero_num:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const_zero_num:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4]
+; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const_zero_num:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const_zero_num:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const_zero_num:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX1150-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX1150-NEXT: v_mov_b32_e32 v3, s3
+; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const_zero_num:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: v_mov_b32_e32 v3, s3
+; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = load <2 x double>, ptr addrspace(1) %in, align 16
+ %r1 = frem <2 x double> <double 0.0, double 0.0>, %r0
+ store <2 x double> %r1, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: frem_v2f64_const_one_denum:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccz .LBB15_2
+; SI-NEXT: ; %bb.1: ; %frem.else16
+; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB15_3
+; SI-NEXT: s_branch .LBB15_8
+; SI-NEXT: .LBB15_2:
+; SI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB15_3: ; %frem.compute15
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, 0x7ff00000
+; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s2, v6
+; SI-NEXT: s_cselect_b32 s3, s2, 0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_add_i32 s5, s3, -1
+; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26
+; SI-NEXT: s_cmp_lt_i32 s5, 27
+; SI-NEXT: s_cbranch_scc1 .LBB15_7
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; SI-NEXT: s_add_i32 s5, s3, 25
+; SI-NEXT: v_mov_b32_e32 v9, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v4, 0
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: .LBB15_5: ; %frem.loop_body23
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v8, v6
+; SI-NEXT: v_mov_b32_e32 v7, v5
+; SI-NEXT: v_bfi_b32 v5, s4, v9, v8
+; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5]
+; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6]
+; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26
+; SI-NEXT: s_sub_i32 s5, s5, 26
+; SI-NEXT: s_cmp_gt_i32 s5, 26
+; SI-NEXT: s_cbranch_scc1 .LBB15_5
+; SI-NEXT: ; %bb.6: ; %Flow50
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; SI-NEXT: s_sub_i32 s2, s5, 25
+; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
+; SI-NEXT: v_bfi_b32 v7, s2, v6, v5
+; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: v_add_f64 v[8:9], v[4:5], v[6:7]
+; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
+; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; SI-NEXT: .LBB15_8:
+; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccz .LBB15_10
+; SI-NEXT: ; %bb.9: ; %frem.else
+; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB15_11
+; SI-NEXT: s_branch .LBB15_16
+; SI-NEXT: .LBB15_10:
+; SI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB15_11: ; %frem.compute
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, 0x7ff00000
+; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3]
+; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc
+; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s2, v8
+; SI-NEXT: s_cselect_b32 s3, s2, 0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_add_i32 s5, s3, -1
+; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26
+; SI-NEXT: s_cmp_lt_i32 s5, 27
+; SI-NEXT: s_cbranch_scc1 .LBB15_15
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; SI-NEXT: s_add_i32 s5, s3, 25
+; SI-NEXT: v_mov_b32_e32 v11, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: .LBB15_13: ; %frem.loop_body
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v10, v8
+; SI-NEXT: v_mov_b32_e32 v9, v7
+; SI-NEXT: v_bfi_b32 v7, s4, v11, v10
+; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7]
+; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8]
+; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26
+; SI-NEXT: s_sub_i32 s5, s5, 26
+; SI-NEXT: s_cmp_gt_i32 s5, 26
+; SI-NEXT: s_cbranch_scc1 .LBB15_13
+; SI-NEXT: ; %bb.14: ; %Flow
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: v_mov_b32_e32 v8, v10
+; SI-NEXT: .LBB15_15: ; %frem.loop_exit
+; SI-NEXT: s_sub_i32 s2, s5, 25
+; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v8, 0x43300000
+; SI-NEXT: v_bfi_b32 v9, s2, v8, v7
+; SI-NEXT: v_mov_b32_e32 v8, 0
+; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9]
+; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; SI-NEXT: .LBB15_16: ; %Flow49
+; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_mov_b32 s5, 0x7ff00000
+; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const_one_denum:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, s2
+; CI-NEXT: s_mov_b32 s5, s3
+; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT: s_cbranch_vccz .LBB15_2
+; CI-NEXT: ; %bb.1: ; %frem.else16
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; CI-NEXT: s_cbranch_execz .LBB15_3
+; CI-NEXT: s_branch .LBB15_8
+; CI-NEXT: .LBB15_2:
+; CI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CI-NEXT: .LBB15_3: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6
+; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; CI-NEXT: s_cbranch_vccnz .LBB15_7
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6
+; CI-NEXT: .LBB15_5: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v5, v5,...
[truncated]
|
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/144/builds/38187 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/27/builds/17750 Here is the relevant piece of the build log for the reference |
This is a small refactoring that sets the return value of the runImpl function — which indicates whether or not the IR has been changed — in a single place, instead of setting it separately at each insertion of a supported instruction into the worklist.