[AMDGPU] Calc IsVALU correctly during UADDO/USUBO selection #159814
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

Changes

Fix two bugs. The first bug hid the second bug.
Patch is 501.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159814.diff

11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..f544c215c8661 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1089,10 +1089,17 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
++UI)
if (UI.getUse().getResNo() == 1) {
- if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
- (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
- IsVALU = true;
- break;
+ if (UI->isMachineOpcode()) {
+ if (UI->getMachineOpcode() !=
+ (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
+ IsVALU = true;
+ break;
+ }
+ } else {
+ if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
+ IsVALU = true;
+ break;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 363717b017ef0..e50bc6c4d363a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5946,6 +5946,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineOperand &Dest0 = MI.getOperand(0);
MachineOperand &Dest1 = MI.getOperand(1);
@@ -5961,9 +5963,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.add(Src1);
// clang-format on
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
- .addImm(1)
- .addImm(0);
+ const TargetRegisterClass *Dest1RC = MRI.getRegClass(Dest1.getReg());
+ unsigned Dest1Size = TRI->getRegSizeInBits(*Dest1RC);
+ assert(Dest1Size == 64 || Dest1Size == 32);
+ unsigned SelOpc =
+ (Dest1Size == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(1).addImm(0);
MI.eraseFromParent();
return BB;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b2dcd77274989..e68353e5223fb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7792,8 +7792,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX6-NEXT: s_ashr_i32 s8, s1, 31
@@ -7803,143 +7804,175 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_sub_u32 s4, 0, s10
-; GFX6-NEXT: s_subb_u32 s5, 0, s11
+; GFX6-NEXT: s_sub_u32 s12, 0, s10
+; GFX6-NEXT: s_subb_u32 s13, 0, s11
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_ashr_i32 s12, s3, 31
-; GFX6-NEXT: s_add_u32 s2, s2, s12
-; GFX6-NEXT: s_mov_b32 s13, s12
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: s_addc_u32 s3, s3, s12
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0
-; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
-; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
-; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
-; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
-; GFX6-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3
-; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
-; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
-; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1
+; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_mul_i32 s15, s13, s0
+; GFX6-NEXT: s_mul_i32 s16, s12, s0
+; GFX6-NEXT: s_add_i32 s1, s17, s1
+; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16
+; GFX6-NEXT: s_add_i32 s1, s1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
+; GFX6-NEXT: s_mul_i32 s17, s0, s1
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1
+; GFX6-NEXT: s_add_u32 s15, s15, s17
+; GFX6-NEXT: v_readfirstlane_b32 s17, v0
+; GFX6-NEXT: s_addc_u32 s17, 0, s17
+; GFX6-NEXT: s_mul_i32 s16, s14, s16
+; GFX6-NEXT: v_readfirstlane_b32 s18, v4
+; GFX6-NEXT: s_add_u32 s15, s15, s16
+; GFX6-NEXT: s_addc_u32 s15, s17, s18
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_addc_u32 s16, s16, 0
+; GFX6-NEXT: s_mul_i32 s1, s14, s1
+; GFX6-NEXT: s_add_u32 s1, s15, s1
+; GFX6-NEXT: s_addc_u32 s15, 0, s16
+; GFX6-NEXT: s_add_i32 s16, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_cselect_b64 s[0:1], 1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_addc_u32 s14, s14, s15
+; GFX6-NEXT: s_mul_i32 s0, s12, s14
+; GFX6-NEXT: v_readfirstlane_b32 s1, v0
+; GFX6-NEXT: s_add_i32 s0, s1, s0
+; GFX6-NEXT: s_mul_i32 s13, s13, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_add_i32 s0, s0, s13
; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT: s_mul_i32 s13, s16, s0
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: v_readfirstlane_b32 s15, v0
+; GFX6-NEXT: s_mul_i32 s1, s14, s1
+; GFX6-NEXT: s_addc_u32 s15, 0, s15
+; GFX6-NEXT: v_readfirstlane_b32 s12, v3
+; GFX6-NEXT: s_add_u32 s1, s13, s1
+; GFX6-NEXT: s_addc_u32 s1, s15, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s0, s14, s0
+; GFX6-NEXT: s_add_u32 s0, s1, s0
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
+; GFX6-NEXT: s_add_i32 s15, s16, s0
+; GFX6-NEXT: s_cselect_b64 s[0:1], 1, 0
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_addc_u32 s14, s14, s12
+; GFX6-NEXT: s_ashr_i32 s12, s7, 31
+; GFX6-NEXT: s_add_u32 s0, s6, s12
+; GFX6-NEXT: s_mov_b32 s13, s12
+; GFX6-NEXT: s_addc_u32 s1, s7, s12
+; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, s15
+; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
+; GFX6-NEXT: s_mov_b32 s0, s4
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
+; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2
+; GFX6-NEXT: s_mul_i32 s1, s6, s14
+; GFX6-NEXT: v_readfirstlane_b32 s16, v3
+; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT: s_add_u32 s1, s16, s1
+; GFX6-NEXT: s_addc_u32 s4, 0, s4
+; GFX6-NEXT: s_mul_i32 s15, s7, s15
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_add_u32 s1, s1, s15
+; GFX6-NEXT: s_addc_u32 s1, s4, s16
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_addc_u32 s4, s4, 0
+; GFX6-NEXT: s_mul_i32 s14, s7, s14
+; GFX6-NEXT: s_add_u32 s14, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_mov_b32 s1, s5
+; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: v_readfirstlane_b32 s5, v0
+; GFX6-NEXT: s_add_i32 s4, s5, s4
+; GFX6-NEXT: s_mul_i32 s5, s11, s14
+; GFX6-NEXT: s_add_i32 s16, s4, s5
+; GFX6-NEXT: s_sub_i32 s17, s7, s16
+; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_sub_i32 s6, s6, s4
+; GFX6-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX6-NEXT: s_or_b32 s18, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s17, s17, s11
+; GFX6-NEXT: s_sub_i32 s19, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX6-NEXT: s_or_b32 s4, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s5, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s10
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s17, s5
+; GFX6-NEXT: s_add_u32 s5, s14, 1
+; GFX6-NEXT: s_addc_u32 s17, s15, 0
+; GFX6-NEXT: s_add_u32 s19, s14, 2
+; GFX6-NEXT: s_addc_u32 s20, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s4, s19, s5
+; GFX6-NEXT: s_cselect_b32 s5, s20, s17
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s7, s7, s16
+; GFX6-NEXT: s_cmp_ge_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s16, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s6, s10
+; GFX6-NEXT: s_cselect_b32 s6, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s6, s6, s16
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b32 s5, s5, s15
+; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
+; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_sub_u32 s4, s4, s6
+; GFX6-NEXT: s_subb_u32 s5, s5, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
-; GFX9-NEXT: s_ashr_i32 s2, s1, 31
-; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: s_mov_b32 s3, s2
-; GFX9-NEXT: s_addc_u32 s1, s1, s2
-; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s0, 0, s6
-; GFX9-NEXT: s_subb_u32 s1, 0, s7
+; GFX9-NEXT: s_ashr_i32 s6, s1, 31
+; GFX9-NEXT: s_add_u32 s0, s0, s6
+; GFX9-NEXT: s_mov_b32 s7, s6
+; GFX9-NEXT: s_addc_u32 s1, s1, s6
+; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_sub_u32 s10, 0, s8
+; GFX9-NEXT: s_subb_u32 s11, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7949,130 +7982,122 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: s_mul_i32 s12, s0, s4
-; GFX9-NEXT: s_mul_hi_u32 s14, s0, s5
-; GFX9-NEXT: s_mul_i32 s13, s1, s5
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s0, s5
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s5, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s5, s12
-; GFX9-NEXT: s_mul_i32 s5, s5, s12
-; GFX9-NEXT: s_add_u32 s5, s14, s5
+; GFX9-NEXT: v_readfirstlane_b32 s12, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s5, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s13, s11, s4
+; GFX9-NEXT: s_add_i32 s5, s14, s5
+; GFX9-NEXT: s_mul_i32 s15, s10, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
+; GFX9-NEXT: s_mul_i32 s16, s4, s5
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_i32 s15, s4, s15
-; GFX9-NEXT: s_add_u32 s5, s5, s15
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s12
-; GFX9-NEXT: s_addc_u32 s5, s13, s16
-; GFX9-NEXT: s_addc_u32 s13, s14, 0
-; GFX9-NEXT: s_mul_i32 s12, s4, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s13
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s4, s12
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_mul_i32 s5, s0, s4
-; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
-; GFX9-NEXT: s_add_i32 s5, s13, s5
-; GFX9-NEXT: s_mul_i32 s1, s1, s12
-; GFX9-NEXT: s_add_i32 s5, s5, s1
-; GFX9-NEXT: s_mul_i32 s0, s0, s12
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s0
-; GFX9-NEXT: s_mul_i32 s14, s4, s0
-; GFX9-NEXT: s_mul_i32 s16, s12, s5
-; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
-; GFX9-NEXT: s_mul_hi_u32 s15, s12, s5
-; GFX9-NEXT: s_add_u32 s0, s0, s16
-; GFX9-NEXT: s_addc_u32 s12, 0, s15
-; GFX9-NEXT: s_add_u32 s0, s0, s14
-; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5
-; GFX9-NEXT: s_addc_u32 s0, s12, s13
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s0, s0, s5
-; GFX9-NEXT: s_addc_u32 s1, 0, s1
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s12, s4, s1
-; GFX9-NEXT: s_ashr_i32 s4, s11, 31
-; GFX9-NEXT: s_add_u32 s0, s10, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_add_u32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_addc_u32 s13, s13, s17
+; GFX9-NEXT: s_addc_u32 s14, s16, 0
+; GFX9-NEXT: s_mul_i32 s5, s12, s5
+; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_addc_u32 s13, 0, s14
+; GFX9-NEXT: s_add_i32 s14, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s4, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s11, s11, s14
+; GFX9-NEXT: s_add_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_i32 s10, s10, s14
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s12, s10
+; GFX9-NEXT: s_mul_i32 s16, s14, s4
+; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_addc_u32 s15, 0, s15
+; GFX9-NEXT: s_add_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_addc_u32 s5, s5, 0
+; GFX9-NEXT: s_mul_i32 s4, s12, s4
+; GFX9-NEXT: s_add_u32 s4, s10, s4
+; GFX9-NEXT: s_addc_u32 s10, 0, s5
+; GFX9-NEXT: s_add_i32 s14, s14, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_ashr_i32 s4, s3, 31
+; GFX9-NEXT...
[truncated]
…PSEUDO (#160142)

Use correct unsigned overflow instructions for S_UADDO_PSEUDO/S_USUBO_PSEUDO. Note that this issue was hidden because instruction selection never selected S_UADDO_PSEUDO/S_USUBO_PSEUDO, which will be addressed in #159814.

Signed-off-by: John Lu <[email protected]>
I think this is OK, but you are enabling code that has not been tested before, so please be alert for any breakages.
Also the code you are enabling does not make too much sense to me. If it is checking for a SALU op feeding into another SALU op, why does it generate extra code to convert from scc to a vcc-like representation of the carry? See inline comment.
; CHECK-NEXT: s_add_u32 s0, s0, 1
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
Objectively this code is still terrible. It should just keep the carry in scc and feed it directly from s_add -> s_addc.
I'll look at this in a separate PR. For reference, prior to this PR we were generating:
v_add_co_u32_e64 v0, s[0:1], s0, 1
s_cmp_lg_u64 s[0:1], 0
s_addc_u32 s0, 1, 0
Fix two bugs. The first bug hid the second bug.
1. Calculate IsVALU correctly during UADDO/USUBO selection. IsVALU should be false if the carry-out users are UADDO_CARRY/USUBO_CARRY. However, instruction selection visits uses before defs, so the UADDO_CARRY/USUBO_CARRY nodes are normally (probably always) already converted to S_ADD_CO_PSEUDO/S_SUB_CO_PSEUDO by the time the UADDO/USUBO node is visited. The fix is to also check for these machine opcodes (see the sketch after this list).

2. Without this fix, UADDO/USUBO selection always selected the VALU instructions V_ADD_CO_U32_e64/V_SUB_CO_U32_e64, so S_UADDO_PSEUDO/S_USUBO_PSEUDO were never selected in the CodeGen/AMDGPU tests and their cases in EmitInstrWithCustomInserter were never hit. The code generation for S_UADDO_PSEUDO/S_USUBO_PSEUDO had a bug: it could not handle a 32-bit $scc_out.
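A minimal standalone sketch of the carry-out-user check from item 1. This is not the LLVM API: the enum values, the CarryUser struct, and computeIsVALU are simplified stand-ins for the SDNode/machine-opcode queries in AMDGPUISelDAGToDAG.cpp, and the list is assumed to already contain only users of the carry-out result.

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the ISD opcodes and AMDGPU machine opcodes
// referenced by the patch.
enum Opcode {
  ISD_UADDO_CARRY,
  ISD_USUBO_CARRY,
  ISD_OTHER,
  AMDGPU_S_ADD_CO_PSEUDO,
  AMDGPU_S_SUB_CO_PSEUDO,
  AMDGPU_OTHER_MACHINE_OP
};

struct CarryUser {
  bool IsMachineOpcode; // already selected into a machine node?
  Opcode Opc;
};

// Mirrors the intent of the patched SelectUADDO_USUBO logic: the add/sub can
// stay scalar only if every carry-out user is (or will become) the matching
// scalar carry pseudo.
bool computeIsVALU(bool IsAdd, const std::vector<CarryUser> &CarryUsers) {
  for (const CarryUser &U : CarryUsers) {
    if (U.IsMachineOpcode) {
      // Uses are visited before defs, so the user is usually already a
      // machine node; compare against the scalar carry pseudo.
      if (U.Opc != (IsAdd ? AMDGPU_S_ADD_CO_PSEUDO : AMDGPU_S_SUB_CO_PSEUDO))
        return true;
    } else {
      // Still a generic DAG node: compare against the ISD carry opcode.
      if (U.Opc != (IsAdd ? ISD_UADDO_CARRY : ISD_USUBO_CARRY))
        return true;
    }
  }
  return false; // every carry user is a scalar carry op, keep it on the SALU
}

int main() {
  // A uaddo whose carry-out already feeds an S_ADD_CO_PSEUDO stays scalar.
  std::vector<CarryUser> Users = {{true, AMDGPU_S_ADD_CO_PSEUDO}};
  std::printf("IsVALU = %d\n", computeIsVALU(/*IsAdd=*/true, Users) ? 1 : 0);
  return 0;
}

The second fix is visible in the SIISelLowering.cpp hunk above: the expansion now chooses S_CSELECT_B32 or S_CSELECT_B64 based on the register size of the carry-out operand instead of hard-coding the 64-bit form.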