diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 48c25d5039bfd..af4f2dc49b690 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -436,6 +436,12 @@ class RegSequenceRewriter : public Rewriter { if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands()) return false; + // Do not introduce new subregister uses in a reg_sequence. Until composing + // subregister indices is supported while folding, we're just blocking + // folding of subregister copies later in the function. + if (NewSubReg) + return false; + MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); MO.setReg(NewReg); MO.setSubReg(NewSubReg); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 40f29c56c8f12..d41601cc0d76e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1635,6 +1635,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] @@ -1682,33 +1683,32 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v6, v3, v2, v6 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v6, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v7, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s11, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v6, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v3, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 @@ -1716,116 +1716,116 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v8 +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX9-NEXT: v_trunc_f32_e32 v4, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 +; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v2 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v17, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v4 ; GFX9-NEXT: s_subb_u32 s20, 0, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v12, v[3:4] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v12, v2 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s20, v17, v[3:4] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v15, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v17, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v10, v17, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v12, v2 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v17, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v8, v7, v3 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v17, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v12, v3, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v7, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v8, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v10, s17, v4 ; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v7, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v6 +; GFX9-NEXT: v_mul_lo_u32 v5, v8, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v7, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v11, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v7, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v8, v2 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v7, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v8, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v6, v10, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 ; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s9, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 ; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v7, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v9, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 4f04c15b3d44a..8e16889c72e65 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -10041,11 +10041,9 @@ define i64 @udiv_i64_gt_smax(i8 %size) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v1 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] ; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index adc91d56c3c27..6166c05c6f895 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2216,31 +2216,31 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec +; GFX1264-NEXT: s_mov_b32 s11, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB4_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] +; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[6:7] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_mul_u64 s[6:7], s[4:5], s[10:11] +; GFX1264-NEXT: s_mov_b32 s14, -1 ; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: v_mov_b32_e32 v0, s8 -; GFX1264-NEXT: v_mov_b32_e32 v1, s9 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: v_mov_b32_e32 v1, s7 +; GFX1264-NEXT: s_mov_b32 s12, s2 +; GFX1264-NEXT: s_mov_b32 s13, s3 +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 @@ -5600,17 +5600,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v2, 0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s7, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v0, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_uniform: @@ -5651,10 +5651,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1064-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -5695,10 +5694,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s7, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1032-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; @@ -5742,9 +5740,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v1, v5 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -5788,9 +5785,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v1, v5 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -5800,31 +5796,31 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec +; GFX1264-NEXT: s_mov_b32 s11, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB10_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] +; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[6:7] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_mul_u64 s[6:7], s[4:5], s[10:11] +; GFX1264-NEXT: s_mov_b32 s14, -1 ; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: v_mov_b32_e32 v0, s8 -; GFX1264-NEXT: v_mov_b32_e32 v1, s9 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: v_mov_b32_e32 v1, s7 +; GFX1264-NEXT: s_mov_b32 s12, s2 +; GFX1264-NEXT: s_mov_b32 s13, s3 +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB10_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 @@ -5833,9 +5829,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] ; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mov_b32_e32 v1, v4 -; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1264-NEXT: s_endpgm @@ -5877,9 +5872,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] ; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_mov_b32_e32 v1, v4 -; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1232-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 45b161d7959f4..8062dbbca7393 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -5235,19 +5235,19 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v0, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_uniform: @@ -5283,9 +5283,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v4, vcc ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -5321,9 +5320,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v4, vcc_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; @@ -5362,9 +5360,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v1, v5 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v5, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; @@ -5402,9 +5399,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v1, v5 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v5, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 1c6808b613427..0c5b67580c352 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -15,16 +15,14 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x55555555 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v4, s6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, v4, s6 +; GFX9-NEXT: v_mul_lo_u32 v5, v4, s7 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3] ; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v3 @@ -61,10 +59,9 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: srem64_3: @@ -73,14 +70,12 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0x55555556, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5] +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 ; GFX1030-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GFX1030-NEXT: v_mul_lo_u32 v5, 0x55555555, v4 -; GFX1030-NEXT: v_mov_b32_e32 v2, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v2 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 +; GFX1030-NEXT: v_mul_lo_u32 v5, 0x55555555, v4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3] ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555556, v4, v[2:3] ; GFX1030-NEXT: v_mul_lo_u32 v4, 0x55555556, v4 @@ -107,16 +102,14 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x55555555 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v4, s6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, v4, s6 +; GFX9-NEXT: v_mul_lo_u32 v5, v4, s7 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3] ; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v3 @@ -153,10 +146,9 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: srem64_6: @@ -165,14 +157,12 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0x55555556, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5] +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 ; GFX1030-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GFX1030-NEXT: v_mul_lo_u32 v5, 0x55555555, v4 -; GFX1030-NEXT: v_mov_b32_e32 v2, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v2 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 +; GFX1030-NEXT: v_mul_lo_u32 v5, 0x55555555, v4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3] ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555556, v4, v[2:3] ; GFX1030-NEXT: v_mul_lo_u32 v4, 0x55555556, v4 @@ -199,11 +189,9 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xaaaaaaaa ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 @@ -235,10 +223,9 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: urem64_3: @@ -247,11 +234,9 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0xaaaaaaab, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v2, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v2 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] ; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 1 @@ -276,11 +261,9 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xaaaaaaaa ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 2 @@ -312,10 +295,9 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 6, v[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: urem64_6: @@ -324,11 +306,9 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0xaaaaaaab, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v2, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v2 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] ; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 2 @@ -353,15 +333,13 @@ define noundef i64 @sdiv64_3(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x55555555 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v5, v0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add3_u32 v1, v5, v1, v4 @@ -400,16 +378,14 @@ define noundef i64 @sdiv64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0x55555556, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v0, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v0 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3] ; GFX1030-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3] ; GFX1030-NEXT: v_mul_lo_u32 v4, 0x55555555, v0 +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mul_lo_u32 v5, 0x55555556, v0 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3] ; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0x55555556, v0, v[2:3] ; GFX1030-NEXT: v_add3_u32 v1, v5, v1, v4 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1 @@ -430,15 +406,13 @@ define noundef i64 @sdiv64_6(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x55555555 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v5, v0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add3_u32 v1, v5, v1, v4 @@ -477,16 +451,14 @@ define noundef i64 @sdiv64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0x55555556, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v0, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v0 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3] ; GFX1030-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3] ; GFX1030-NEXT: v_mul_lo_u32 v4, 0x55555555, v0 +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mul_lo_u32 v5, 0x55555556, v0 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3] ; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0x55555556, v0, v[2:3] ; GFX1030-NEXT: v_add3_u32 v1, v5, v1, v4 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1 @@ -507,11 +479,9 @@ define noundef i64 @udiv64_3(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xaaaaaaaa ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 1 @@ -543,11 +513,9 @@ define noundef i64 @udiv64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0xaaaaaaab, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v0, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v0 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0xaaaaaaaa, v1, v[2:3] ; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 1 @@ -567,11 +535,9 @@ define noundef i64 @udiv64_6(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xaaaaaaaa ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 2 @@ -603,11 +569,9 @@ define noundef i64 @udiv64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0xaaaaaaab, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v0, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v0 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0xaaaaaaaa, v1, v[2:3] ; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 2 @@ -1005,8 +969,7 @@ define noundef i64 @udiv64_i32min(i64 noundef %i) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 1, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -1033,8 +996,7 @@ define noundef i64 @udiv64_i32min(i64 noundef %i) { ; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 31 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 31, v1 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 1, v[1:2] -; GFX1030-NEXT: v_mov_b32_e32 v0, v3 -; GFX1030-NEXT: v_add_co_u32 v0, s4, v1, v0 +; GFX1030-NEXT: v_add_co_u32 v0, s4, v1, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s4 ; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0 @@ -1049,19 +1011,17 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_hi_u32 v2, v0, 3 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, 3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, 3, v[2:3] +; GFX9-NEXT: v_lshl_add_u32 v2, v8, 31, v8 +; GFX9-NEXT: v_add3_u32 v5, v5, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-NEXT: s_mov_b32 s6, 0x80000001 -; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3] -; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1 @@ -1112,10 +1072,9 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: srem64_i32max: @@ -1123,23 +1082,21 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 -; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX1030-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v8, 3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v7, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0 -; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5] -; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX1030-NEXT: v_mov_b32_e32 v4, v5 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3] -; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4 -; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1 -; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5] -; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v6, v0 -; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX1030-NEXT: v_lshl_add_u32 v8, v8, 31, v8 +; GFX1030-NEXT: v_add3_u32 v7, v7, v8, v6 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v0, -1, v[6:7] +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x80000001, v0, v[2:3] +; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v7, v1 +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 +; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v4, v0 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x80000001, v1, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 +; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 1, v[2:3] ; GFX1030-NEXT: v_add_nc_u32_e32 v3, v1, v3 ; GFX1030-NEXT: v_ashrrev_i64 v[4:5], 30, v[2:3] @@ -1161,19 +1118,17 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_hi_u32 v2, v0, 3 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, 3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, 3, v[2:3] +; GFX9-NEXT: v_lshl_add_u32 v2, v8, 31, v8 +; GFX9-NEXT: v_add3_u32 v5, v5, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-NEXT: s_mov_b32 s6, 0x80000001 -; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3] -; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1 @@ -1222,23 +1177,21 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 -; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX1030-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v8, 3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v7, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0 -; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5] -; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX1030-NEXT: v_mov_b32_e32 v4, v5 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3] -; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4 -; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1 -; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5] -; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v6, v0 -; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX1030-NEXT: v_lshl_add_u32 v8, v8, 31, v8 +; GFX1030-NEXT: v_add3_u32 v7, v7, v8, v6 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v0, -1, v[6:7] +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x80000001, v0, v[2:3] +; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v7, v1 +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 +; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v4, v0 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x80000001, v1, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 +; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 1, v[2:3] ; GFX1030-NEXT: v_add_nc_u32_e32 v3, v1, v3 ; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 30, v[2:3] @@ -1259,11 +1212,9 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_brev_b32 s6, -2 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 5, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, 2, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 @@ -1303,10 +1254,9 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_lshrrev_b32_e32 v3, 30, v3 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: urem64_i32max: @@ -1315,11 +1265,9 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 5 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 5, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 2, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v2, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v2 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 2, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v1, 2, v[2:3] ; GFX1030-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 @@ -1346,11 +1294,9 @@ define noundef i64 @udiv64_i32max(i64 noundef %i) { ; GFX9-NEXT: v_mul_hi_u32 v2, v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 5, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, 2, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -1390,11 +1336,9 @@ define noundef i64 @udiv64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 5 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 5, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v6, v5 -; GFX1030-NEXT: v_mov_b32_e32 v5, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 2, v[4:5] -; GFX1030-NEXT: v_mov_b32_e32 v2, v3 -; GFX1030-NEXT: v_add_co_u32 v2, s4, v6, v2 +; GFX1030-NEXT: v_mov_b32_e32 v2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 2, v[2:3] +; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v1, 2, v[2:3] ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 8d65fa053eaa4..41999b249a0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1954,65 +1954,62 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mul_lo_u32 v12, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 ; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 -; SDAG-NEXT: v_mul_lo_u32 v35, v35, v31 -; SDAG-NEXT: v_mul_lo_u32 v38, v32, v30 +; SDAG-NEXT: v_mul_lo_u32 v25, v35, v31 +; SDAG-NEXT: v_mul_lo_u32 v35, v32, v30 ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_mul_lo_u32 v25, v14, v7 +; SDAG-NEXT: v_mul_lo_u32 v38, v14, v7 ; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v14, v6, 0 ; SDAG-NEXT: v_mul_lo_u32 v39, v15, v6 -; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 +; SDAG-NEXT: v_mul_lo_u32 v48, v19, v37 +; SDAG-NEXT: v_mul_lo_u32 v49, v18, v36 ; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; SDAG-NEXT: v_mov_b32_e32 v12, v3 ; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[12:13] -; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v21, v25 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v21, v38 ; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v24, v23 -; SDAG-NEXT: v_mov_b32_e32 v23, v13 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v33, v12, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v16, v39 -; SDAG-NEXT: v_mov_b32_e32 v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v36, v14, v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v12, v22 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[12:13] +; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v19, v39 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] -; SDAG-NEXT: v_add_i32_e64 v24, s[4:5], v24, v3 -; SDAG-NEXT: v_addc_u32_e64 v25, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v17, v2, vcc +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 +; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v14, v23 -; SDAG-NEXT: v_mov_b32_e32 v23, v13 -; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v37, v15, v[22:23] -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v35, v11 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v30, v27, v[24:25] -; SDAG-NEXT: v_xor_b32_e32 v7, v7, v29 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v18, v12 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v38, v11 +; SDAG-NEXT: v_mov_b32_e32 v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v14, v[12:13] +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v30, v27, v[22:23] +; SDAG-NEXT: v_xor_b32_e32 v14, v31, v29 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v15, v[13:14] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v16, v10 -; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], v17, v19, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v12, v16 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v37, v15, v[12:13] +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v35, v7 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v49, v3 +; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12 +; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v19, v7, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v2 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], v12, v3, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc -; SDAG-NEXT: v_xor_b32_e32 v2, v0, v28 +; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v36, v15, v[12:13] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, v12, v2 +; SDAG-NEXT: v_addc_u32_e32 v12, vcc, v13, v3, vcc ; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v33, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v7, v29, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v28, vcc +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v14, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc ; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v11, vcc ; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 ; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v7, v7, v34 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v12, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v34 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 @@ -2827,49 +2824,43 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mul_lo_u32 v34, v18, v15 ; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0 ; SDAG-NEXT: v_mul_lo_u32 v35, v19, v14 -; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 +; SDAG-NEXT: v_mul_lo_u32 v36, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v37, v22, v13 ; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v18, 0 ; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; SDAG-NEXT: v_mov_b32_e32 v20, v11 ; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v34 +; SDAG-NEXT: v_add_i32_e64 v23, s[4:5], v25, v34 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 -; SDAG-NEXT: v_mov_b32_e32 v28, v27 -; SDAG-NEXT: v_mov_b32_e32 v27, v21 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v20, v35 +; SDAG-NEXT: v_mov_b32_e32 v20, v26 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v23, v35 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v8, v[16:17] +; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 +; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] ; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v18, v[20:21] -; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17] -; SDAG-NEXT: v_mov_b32_e32 v8, v11 -; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v18, v[20:21] +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v29, v17 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v11 +; SDAG-NEXT: v_mov_b32_e32 v20, v22 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v19, v[20:21] +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v33, v15 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v37, v17 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v12 ; SDAG-NEXT: v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v8, v10 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] -; SDAG-NEXT: v_mov_b32_e32 v22, v27 -; SDAG-NEXT: v_mov_b32_e32 v27, v21 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v19, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v29, v16 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18] -; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v11, v21 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16 -; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v19, v[11:12] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 -; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v16 +; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v15, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10 -; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc -; SDAG-NEXT: v_mov_b32_e32 v10, v20 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v19, v[17:18] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 3a7f3e41002d2..3e6b812c12d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -61,32 +61,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3] -; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 -; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12 -; SDAG-NEXT: v_mov_b32_e32 v6, v2 -; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v6 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_add3_u32 v5, v5, v14, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v6, v9, v11 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] +; SDAG-NEXT: v_add3_u32 v5, v9, v5, v6 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v4 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] @@ -103,10 +102,9 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v7, v4 -; SDAG-NEXT: v_mov_b32_e32 v4, v2 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] @@ -429,32 +427,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3] -; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 -; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12 -; SDAG-NEXT: v_mov_b32_e32 v6, v2 -; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v6 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_add3_u32 v5, v5, v14, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v6, v9, v11 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] +; SDAG-NEXT: v_add3_u32 v5, v9, v5, v6 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v4 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] @@ -471,10 +468,9 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v7, v4 -; SDAG-NEXT: v_mov_b32_e32 v4, v2 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] @@ -743,22 +739,22 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 +; SDAG-NEXT: v_bfe_u32 v6, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB2_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end -; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc -; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc ; SDAG-NEXT: s_movk_i32 s6, 0xff7f -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc ; SDAG-NEXT: s_mov_b32 s7, -1 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] @@ -774,66 +770,65 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: v_or_b32_e32 v4, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB2_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else -; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, v5 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mov_b32_e32 v4, v6 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB2_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v4, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] @@ -949,8 +944,8 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 -; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1 +; GISEL-NEXT: v_lshl_or_b32 v9, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v10, v1, v2, 1 ; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -968,7 +963,7 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] @@ -977,24 +972,24 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v10, 0 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v10, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v10, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: .LBB2_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 @@ -1005,9 +1000,9 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v9, 0 +; GISEL-NEXT: v_mul_lo_u32 v5, v4, v9 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: .LBB2_6: ; %Flow1 @@ -1098,22 +1093,22 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 +; SDAG-NEXT: v_bfe_u32 v6, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB3_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end -; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc -; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc ; SDAG-NEXT: s_movk_i32 s6, 0xff7f -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc ; SDAG-NEXT: s_mov_b32 s7, -1 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] @@ -1129,66 +1124,65 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: v_or_b32_e32 v4, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB3_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else -; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, v5 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mov_b32_e32 v4, v6 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB3_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB3_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v4, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] @@ -1304,8 +1298,8 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 -; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1 +; GISEL-NEXT: v_lshl_or_b32 v9, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v10, v1, v2, 1 ; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -1323,7 +1317,7 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] @@ -1332,24 +1326,24 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v10, 0 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v10, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v10, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: .LBB3_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 @@ -1360,9 +1354,9 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v9, 0 +; GISEL-NEXT: v_mul_lo_u32 v5, v4, v9 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: .LBB3_6: ; %Flow1 @@ -1481,22 +1475,22 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 +; SDAG-NEXT: v_bfe_u32 v6, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB6_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end -; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc -; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc ; SDAG-NEXT: s_movk_i32 s6, 0xff7f -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc ; SDAG-NEXT: s_mov_b32 s7, -1 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] @@ -1511,66 +1505,65 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: s_movk_i32 s4, 0x7f ; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 +; SDAG-NEXT: v_or_b32_e32 v4, 0x80, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB6_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else -; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 +; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff7a, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v8, v[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, v5 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mov_b32_e32 v4, v6 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11 -; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v10, v11 +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_add3_u32 v3, v8, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB6_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] ; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 @@ -1830,22 +1823,22 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 +; SDAG-NEXT: v_bfe_u32 v6, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB7_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end -; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc -; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc ; SDAG-NEXT: s_movk_i32 s6, 0xff7f -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc ; SDAG-NEXT: s_mov_b32 s7, -1 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] @@ -1860,66 +1853,65 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: s_movk_i32 s4, 0x7f ; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 +; SDAG-NEXT: v_or_b32_e32 v4, 0x80, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB7_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else -; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 +; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff7a, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v8, v[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, v5 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mov_b32_e32 v4, v6 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11 -; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v10, v11 +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_add3_u32 v3, v8, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB7_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] ; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 3d3e8bea7e33e..051a0c51b0867 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -155,61 +155,61 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe -; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s8, s4 -; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s4, s0 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s0, s3, 8 +; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s0, 0xffe +; VI-SAFE-SDAG-NEXT: s_and_b32 s0, s3, 0x1ff +; VI-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s2 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s5, s1 +; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SAFE-SDAG-NEXT: s_bfe_u32 s1, s3, 0xb0014 +; VI-SAFE-SDAG-NEXT: s_or_b32 s2, s8, s0 +; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s1 ; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000 +; VI-SAFE-SDAG-NEXT: s_or_b32 s0, s2, 0x1000 ; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s4, s8 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s0, s8 ; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v0, s8 -; VI-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; VI-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s0, v0 ; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s5, 0xfffffc10 -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s5, s10, 12 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 +; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s1, 0xfffffc10 +; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s1, s10, 12 +; VI-SAFE-SDAG-NEXT: s_or_b32 s0, s8, s0 +; VI-SAFE-SDAG-NEXT: s_or_b32 s1, s2, s1 ; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s4, s5 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s0, s1 ; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s11, 7 ; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 ; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-SAFE-SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; VI-SAFE-SDAG-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s11, 2 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-SAFE-SDAG-NEXT: s_addc_u32 s4, s8, 0 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-SAFE-SDAG-NEXT: s_addc_u32 s0, s8, 0 ; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, s4, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, s0, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f ; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0 ; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v1, s8 ; VI-SAFE-SDAG-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 16 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s0, s3, 16 ; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 -; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000 +; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-SAFE-SDAG-NEXT: s_endpgm ; ; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index b443e654350c5..cd85c301e16d5 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -151,20 +151,20 @@ entry: define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MulMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -174,8 +174,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MulMul: @@ -1698,20 +1698,20 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1719,10 +1719,10 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_add1: @@ -1851,20 +1851,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -1872,10 +1872,10 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_add1: @@ -2004,20 +2004,20 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2025,10 +2025,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s0 +; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_mul1: @@ -2163,20 +2163,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2184,10 +2184,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s0 +; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_mul1: @@ -2322,31 +2322,31 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0 +; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_mul2: @@ -2479,20 +2479,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2500,10 +2500,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0 +; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s4 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_mul2: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 10fac09ef4ec0..3a97724d81fbe 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -4906,19 +4906,19 @@ entry: define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hilo: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4933,7 +4933,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_hilo: diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index c0de009e935e6..c0c0d3ded117d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5168,38 +5168,34 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v6, v4, v2 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v4, v2 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-GISEL-NEXT: v_mov_b32_e32 v7, v0 -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v4, v2 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1] -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v3, v4, v2 +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v0, v5, vcc_lo +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2] ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1] -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v3, v4 -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1200-GISEL-NEXT: v_mov_b32_e32 v6, v0 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v7, vcc_lo -; GFX1200-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v3, 1 +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v0, vcc_lo +; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v7, 1 +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v4 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4] +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2] -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v2, v5, v7 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v4, v[1:2] -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v6, v[2:3] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v7, v[1:2] +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v5, v8 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v4, v[2:3] +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, v[0:1] +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v8, v[2:3] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -6041,76 +6037,72 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX1200-GISEL-NEXT: v_add_co_u32 v9, s0, v2, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo -; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v1, vcc_lo +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4 -; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v6 +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v9, v6 +; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, vcc_lo, 0, v3, s0 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v13, v10, v6 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v13, v9, v6 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v8, v5, v[0:1] -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v10, v7, v[1:2] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v9, v4, v[2:3] -; GFX1200-GISEL-NEXT: v_mov_b32_e32 v14, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v12, v8 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_add_co_u32 v15, s0, v13, v9 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v9, v7, v[1:2] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v10, v4, v[2:3] ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v11, v6, v[0:1] -; GFX1200-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v12, v8 -; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v14, v9, vcc_lo -; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v13, v10 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v3, v4 -; GFX1200-GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v15, v3, v4 -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v6 -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v14, v4 +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v3, v15, v6 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v8, v11, vcc_lo -; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v12, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v3, v5, v[0:1] +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v1, v10, vcc_lo +; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, vcc_lo, v2, v11, s0 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[8:9], null, v14, v5, v[0:1] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[9:10], null, v15, v7, v[3:4] +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v10, v15, v6 +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v14, v4 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v16, v4, v[8:9] +; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v12, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[4:5], null, v11, v6, v[9:10] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v14, vcc_lo -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v10, v7, v[1:2] -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v15, v12 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v9, v4, v[2:3] +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v6, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[4:5], null, v15, v5, v[1:2] -; GFX1200-GISEL-NEXT: v_mov_b32_e32 v1, v2 -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v16, v9 -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v16, v9 -; GFX1200-GISEL-NEXT: v_mov_b32_e32 v11, v3 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v8, v15, v12 -; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v15, 1 +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v8 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v9 +; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v7, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo -; GFX1200-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v16, 1 +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v3, vcc_lo +; GFX1200-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v10, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v7, v5, v[0:1] +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v10, v2, v[1:2] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v2, v14, v15 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v8, v10 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v16, v7, v[0:1] -; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v14, v15 -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v12, v[4:5] -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v8, v13, v[1:2] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v9, v[5:6] -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[4:5], null, v14, v11, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v10, v[1:2] -; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v8, v10 +; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v4, vcc_lo +; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v11, v12 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v3, v8, v[5:6] +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v9, v[1:2] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[6:7], null, v11, v13, v[0:1] +; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v11, v12 +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[7:8], null, v14, v10, v[2:3] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v12, v[6:7] ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v14, v15 -; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[4:5] +; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[7:8] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 94c2e518a9fd3..8f7456b788f81 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -32,13 +32,11 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX9-LABEL: umulo_i64_v_v: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v5, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v1, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v7 @@ -54,18 +52,16 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX10-LABEL: umulo_i64_v_v: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v4, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v5, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v5, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v0, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v0, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v3, 0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v1, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v5, v7 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo @@ -78,23 +74,22 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v5, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v5, v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v1, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -106,23 +101,21 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v4, v1 -; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v0, v3, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v4, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v3, 0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 +; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add3_u32 v1, v1, v5, v7 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 @@ -181,11 +174,10 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, v1 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v10, v6 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v1, v6 ; GFX9-NEXT: v_mad_i64_i32 v[10:11], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v8 @@ -215,14 +207,13 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 ; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v4, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v5, v2, 0 ; GFX10-NEXT: v_mad_i64_i32 v[10:11], s4, v5, v3, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, v1 -; GFX10-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6 +; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v6, v8 ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo @@ -249,38 +240,37 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v5, v2, 0 ; GFX11-NEXT: v_mad_i64_i32 v[10:11], null, v5, v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v12, v1 -; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -294,45 +284,44 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 ; GFX12-NEXT: v_mad_co_i64_i32 v[10:11], null, v5, v3, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v12, v1 -; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index fbda0e71a74c6..1fc7349882ba1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -9465,47 +9465,47 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[12:13], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s96, s13, 30 -; GFX12-NEXT: s_lshr_b32 s98, s13, 31 -; GFX12-NEXT: s_lshr_b32 s92, s13, 28 -; GFX12-NEXT: s_lshr_b32 s94, s13, 29 -; GFX12-NEXT: s_lshr_b32 s78, s13, 26 -; GFX12-NEXT: s_lshr_b32 s88, s13, 27 +; GFX12-NEXT: s_lshr_b32 s96, s11, 30 +; GFX12-NEXT: s_lshr_b32 s98, s11, 31 +; GFX12-NEXT: s_lshr_b32 s92, s11, 28 +; GFX12-NEXT: s_lshr_b32 s94, s11, 29 +; GFX12-NEXT: s_lshr_b32 s78, s11, 26 +; GFX12-NEXT: s_lshr_b32 s88, s11, 27 ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX12-NEXT: s_lshr_b32 s66, s13, 24 -; GFX12-NEXT: s_lshr_b32 s74, s13, 25 +; GFX12-NEXT: s_lshr_b32 s66, s11, 24 +; GFX12-NEXT: s_lshr_b32 s74, s11, 25 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 -; GFX12-NEXT: s_lshr_b32 s56, s13, 22 -; GFX12-NEXT: s_lshr_b32 s62, s13, 23 +; GFX12-NEXT: s_lshr_b32 s56, s11, 22 +; GFX12-NEXT: s_lshr_b32 s62, s11, 23 ; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100 ; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92 ; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX12-NEXT: s_lshr_b32 s44, s13, 20 -; GFX12-NEXT: s_lshr_b32 s52, s13, 21 -; GFX12-NEXT: s_lshr_b32 s30, s13, 18 -; GFX12-NEXT: s_lshr_b32 s40, s13, 19 -; GFX12-NEXT: s_lshr_b32 s18, s13, 16 -; GFX12-NEXT: s_lshr_b32 s26, s13, 17 -; GFX12-NEXT: s_lshr_b32 s2, s13, 14 -; GFX12-NEXT: s_lshr_b32 s4, s13, 15 +; GFX12-NEXT: s_lshr_b32 s44, s11, 20 +; GFX12-NEXT: s_lshr_b32 s52, s11, 21 +; GFX12-NEXT: s_lshr_b32 s30, s11, 18 +; GFX12-NEXT: s_lshr_b32 s40, s11, 19 +; GFX12-NEXT: s_lshr_b32 s18, s11, 16 +; GFX12-NEXT: s_lshr_b32 s26, s11, 17 +; GFX12-NEXT: s_lshr_b32 s2, s11, 14 +; GFX12-NEXT: s_lshr_b32 s4, s11, 15 ; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 ; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX12-NEXT: s_lshr_b32 s6, s13, 12 -; GFX12-NEXT: s_lshr_b32 s8, s13, 13 +; GFX12-NEXT: s_lshr_b32 s6, s11, 12 +; GFX12-NEXT: s_lshr_b32 s8, s11, 13 ; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 ; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX12-NEXT: s_lshr_b32 s10, s13, 10 -; GFX12-NEXT: s_lshr_b32 s14, s13, 11 +; GFX12-NEXT: s_lshr_b32 s12, s11, 10 +; GFX12-NEXT: s_lshr_b32 s14, s11, 11 ; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 ; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 @@ -9516,14 +9516,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: s_lshr_b32 s16, s13, 8 -; GFX12-NEXT: s_lshr_b32 s20, s13, 9 +; GFX12-NEXT: s_lshr_b32 s16, s11, 8 +; GFX12-NEXT: s_lshr_b32 s20, s11, 9 ; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 ; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: s_lshr_b32 s22, s13, 6 -; GFX12-NEXT: s_lshr_b32 s24, s13, 7 +; GFX12-NEXT: s_lshr_b32 s22, s11, 6 +; GFX12-NEXT: s_lshr_b32 s24, s11, 7 ; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52 ; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30 ; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40 @@ -9531,7 +9531,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s26 ; GFX12-NEXT: v_mov_b32_e32 v32, s27 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX12-NEXT: s_clause 0x7 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:496 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:480 @@ -9544,36 +9544,36 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 ; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_lshr_b32 s28, s13, 4 -; GFX12-NEXT: s_lshr_b32 s34, s13, 5 -; GFX12-NEXT: s_lshr_b32 s36, s13, 2 -; GFX12-NEXT: s_lshr_b32 s38, s13, 3 +; GFX12-NEXT: s_lshr_b32 s28, s11, 4 +; GFX12-NEXT: s_lshr_b32 s34, s11, 5 +; GFX12-NEXT: s_lshr_b32 s36, s11, 2 +; GFX12-NEXT: s_lshr_b32 s38, s11, 3 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 -; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 -; GFX12-NEXT: s_lshr_b32 s42, s13, 1 -; GFX12-NEXT: s_mov_b32 s46, s13 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12 +; GFX12-NEXT: s_lshr_b32 s42, s11, 1 +; GFX12-NEXT: s_mov_b32 s46, s11 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s14 +; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 ; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 -; GFX12-NEXT: s_lshr_b32 s48, s12, 30 -; GFX12-NEXT: s_lshr_b32 s50, s12, 31 +; GFX12-NEXT: s_lshr_b32 s48, s10, 30 +; GFX12-NEXT: s_lshr_b32 s50, s10, 31 ; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20 ; GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22 -; GFX12-NEXT: s_lshr_b32 s54, s12, 28 -; GFX12-NEXT: s_lshr_b32 s58, s12, 29 +; GFX12-NEXT: s_lshr_b32 s54, s10, 28 +; GFX12-NEXT: s_lshr_b32 s58, s10, 29 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24 ; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28 -; GFX12-NEXT: s_lshr_b32 s60, s12, 26 -; GFX12-NEXT: s_lshr_b32 s64, s12, 27 +; GFX12-NEXT: s_lshr_b32 s60, s10, 26 +; GFX12-NEXT: s_lshr_b32 s64, s10, 27 ; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34 @@ -9588,43 +9588,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37 ; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39 ; GFX12-NEXT: v_mov_b32_e32 v5, s46 -; GFX12-NEXT: s_lshr_b32 s68, s12, 24 -; GFX12-NEXT: s_lshr_b32 s70, s12, 25 -; GFX12-NEXT: s_lshr_b32 s72, s12, 22 -; GFX12-NEXT: s_lshr_b32 s76, s12, 23 +; GFX12-NEXT: s_lshr_b32 s68, s10, 24 +; GFX12-NEXT: s_lshr_b32 s70, s10, 25 +; GFX12-NEXT: s_lshr_b32 s72, s10, 22 +; GFX12-NEXT: s_lshr_b32 s76, s10, 23 ; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42 ; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48 -; GFX12-NEXT: s_lshr_b32 s80, s12, 20 -; GFX12-NEXT: s_lshr_b32 s82, s12, 21 +; GFX12-NEXT: s_lshr_b32 s80, s10, 20 +; GFX12-NEXT: s_lshr_b32 s82, s10, 21 ; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50 ; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54 -; GFX12-NEXT: s_lshr_b32 s84, s12, 18 -; GFX12-NEXT: s_lshr_b32 s86, s12, 19 +; GFX12-NEXT: s_lshr_b32 s84, s10, 18 +; GFX12-NEXT: s_lshr_b32 s86, s10, 19 ; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58 ; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60 -; GFX12-NEXT: s_lshr_b32 s90, s12, 16 -; GFX12-NEXT: s_lshr_b32 s98, s12, 17 +; GFX12-NEXT: s_lshr_b32 s90, s10, 16 +; GFX12-NEXT: s_lshr_b32 s98, s10, 17 ; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64 ; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68 -; GFX12-NEXT: s_lshr_b32 s96, s12, 14 -; GFX12-NEXT: s_lshr_b32 s100, s12, 15 -; GFX12-NEXT: s_lshr_b32 s94, s12, 13 -; GFX12-NEXT: s_lshr_b32 s88, s12, 11 -; GFX12-NEXT: s_lshr_b32 s74, s12, 9 -; GFX12-NEXT: s_lshr_b32 s62, s12, 7 -; GFX12-NEXT: s_lshr_b32 s52, s12, 5 -; GFX12-NEXT: s_lshr_b32 s40, s12, 3 -; GFX12-NEXT: s_lshr_b32 s26, s12, 1 +; GFX12-NEXT: s_lshr_b32 s96, s10, 14 +; GFX12-NEXT: s_lshr_b32 s100, s10, 15 +; GFX12-NEXT: s_lshr_b32 s94, s10, 13 +; GFX12-NEXT: s_lshr_b32 s88, s10, 11 +; GFX12-NEXT: s_lshr_b32 s74, s10, 9 +; GFX12-NEXT: s_lshr_b32 s62, s10, 7 +; GFX12-NEXT: s_lshr_b32 s52, s10, 5 +; GFX12-NEXT: s_lshr_b32 s40, s10, 3 +; GFX12-NEXT: s_lshr_b32 s26, s10, 1 ; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70 @@ -9639,19 +9639,19 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73 ; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77 ; GFX12-NEXT: v_mov_b32_e32 v5, s80 -; GFX12-NEXT: s_lshr_b32 s92, s12, 12 -; GFX12-NEXT: s_lshr_b32 s78, s12, 10 +; GFX12-NEXT: s_lshr_b32 s92, s10, 12 +; GFX12-NEXT: s_lshr_b32 s78, s10, 10 ; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82 ; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84 -; GFX12-NEXT: s_lshr_b32 s66, s12, 8 -; GFX12-NEXT: s_lshr_b32 s56, s12, 6 -; GFX12-NEXT: s_lshr_b32 s44, s12, 4 -; GFX12-NEXT: s_lshr_b32 s30, s12, 2 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX12-NEXT: s_lshr_b32 s66, s10, 8 +; GFX12-NEXT: s_lshr_b32 s56, s10, 6 +; GFX12-NEXT: s_lshr_b32 s44, s10, 4 +; GFX12-NEXT: s_lshr_b32 s30, s10, 2 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 @@ -9695,8 +9695,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 ; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26 ; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18 -; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s12 -; GFX12-NEXT: v_mov_b32_e32 v24, s13 +; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10 +; GFX12-NEXT: v_mov_b32_e32 v24, s11 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 341332e60b5c0..4ce3b46211e64 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -1843,11 +1843,10 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_ashr_i32 s0, s3, 31 -; GFX7-HSA-NEXT: s_mov_b32 s1, s3 -; GFX7-HSA-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s1, s2, 31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -1861,11 +1860,10 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 31 -; GFX8-NOHSA-NEXT: s_mov_b32 s1, s3 -; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s2, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -1902,8 +1900,8 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s5, s2, 31 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 4217384cdd5ce..8589158f11a70 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5933,17 +5933,17 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 @@ -5956,8 +5956,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v4i16_to_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 5ce8a2b5f862e..0573de4a7f2d1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -1555,13 +1555,14 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOHSA-NEXT: s_mov_b32 s8, s2 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3 -; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v1 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NOHSA-NEXT: s_endpgm ; @@ -1571,14 +1572,15 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCNX3-HSA-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v1 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: @@ -1591,13 +1593,14 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v5 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCNX3-NOHSA-NEXT: s_endpgm ; @@ -1626,14 +1629,15 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GCN-HSA-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5 +; GCN-HSA-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GCN-HSA-NEXT: s_endpgm %ld = load <2 x i32>, ptr addrspace(1) %in %ext = sext <2 x i32> %ld to <2 x i64> @@ -1902,36 +1906,36 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOHSA-NEXT: s_mov_b32 s6, -1 -; SI-NOHSA-NEXT: s_mov_b32 s10, s6 -; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s2, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s2 +; SI-NOHSA-NEXT: s_mov_b32 s11, s3 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOHSA-NEXT: s_mov_b32 s8, s2 -; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: s_mov_b32 s8, s6 +; SI-NOHSA-NEXT: s_mov_b32 s9, s7 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, 0 ; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v9 -; SI-NOHSA-NEXT: s_mov_b32 s4, s0 -; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_mov_b32 s0, s4 +; SI-NOHSA-NEXT: s_mov_b32 s1, s5 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v2 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v3 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v0 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v1 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v6 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v7 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v4 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v5 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; SI-NOHSA-NEXT: s_endpgm ; ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: @@ -1981,36 +1985,36 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 -; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 -; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 -; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 -; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9 -; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 -; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 +; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCNX3-NOHSA-NEXT: s_nop 0 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_nop 0 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCNX3-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v8i32_to_v8i64: @@ -2091,17 +2095,17 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOHSA-NEXT: s_mov_b32 s6, -1 -; SI-NOHSA-NEXT: s_mov_b32 s10, s6 -; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s2, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s2 +; SI-NOHSA-NEXT: s_mov_b32 s11, s3 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOHSA-NEXT: s_mov_b32 s8, s2 -; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: s_mov_b32 s8, s6 +; SI-NOHSA-NEXT: s_mov_b32 s9, s7 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NOHSA-NEXT: s_mov_b32 s4, s0 -; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_mov_b32 s0, s4 +; SI-NOHSA-NEXT: s_mov_b32 s1, s5 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 @@ -2121,10 +2125,10 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; SI-NOHSA-NEXT: v_mov_b32_e32 v13, v3 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v0 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v1 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:48 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:32 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NOHSA-NEXT: s_endpgm ; ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: @@ -2369,13 +2373,13 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -2402,10 +2406,10 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 @@ -2414,30 +2418,30 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v10 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v11 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 @@ -2611,60 +2615,115 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; EG-NEXT: MOV * T16.Z, T1.Y, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: -; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 -; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48 -; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16 -; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3] -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(2) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16 -; GCN-HSA-NEXT: s_endpgm +; GCN-GFX900-HSA-LABEL: global_sextload_v16i32_to_v16i64: +; GCN-GFX900-HSA: ; %bb.0: +; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v36, 0 +; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3] +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v24, v4 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v26, v5 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v20, v6 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v22, v7 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v16, v2 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v18, v3 +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v8 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v9 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v10 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v32, v12 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v34, v13 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v14 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v15 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16 +; GCN-GFX900-HSA-NEXT: s_endpgm +; +; GCN-GFX908-HSA-LABEL: global_sextload_v16i32_to_v16i64: +; GCN-GFX908-HSA: ; %bb.0: +; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, 0 +; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:32 +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:48 +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[9:12], v0, s[2:3] offset:16 +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v0, s[2:3] +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v4 +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v6 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v5 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v5 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v6 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v3 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v8 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v21, v7 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v23, v8 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v5, v1 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v7, v2 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v3 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v4 +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v12 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v11 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v9 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v9 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v10 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v1, v11 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v3, v12 +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v14 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v13 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v13 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v14 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v9, v15 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v11, v16 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:96 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:112 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:64 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:32 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:48 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[33:36], s[0:1] +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:16 +; GCN-GFX908-HSA-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(1) %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -3137,26 +3196,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13] ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 @@ -3223,26 +3282,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v10 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v11 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 @@ -3253,58 +3312,58 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[16:19] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v15 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCNX3-HSA-NEXT: s_endpgm ; @@ -3941,13 +4000,13 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 @@ -4032,29 +4091,29 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index b6ff99214249a..a5f6c2fe5d264 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -92,11 +92,11 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 ; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 ; GFX940-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 ; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 @@ -523,11 +523,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 ; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 ; GFX940-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 ; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 11c62a7312755..d4f75051b04d4 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -226,19 +226,17 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; CI-NEXT: v_mov_b32_e32 v8, 0 ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v1, v[7:8] ; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v1 -; CI-NEXT: v_mov_b32_e32 v11, v10 -; CI-NEXT: v_mov_b32_e32 v10, v8 -; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v13, v[9:10] -; CI-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v13, v[7:8] +; CI-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CI-NEXT: v_mad_i64_i32 v[10:11], s[4:5], v1, v12, 0 ; CI-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc ; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v13, v[8:9] ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v13, v0, v[10:11] ; CI-NEXT: v_add_i32_e32 v8, vcc, v8, v0 ; CI-NEXT: v_addc_u32_e32 v9, vcc, v9, v1, vcc -; CI-NEXT: v_mov_b32_e32 v1, v7 ; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, v7, v3, vcc ; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc ; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc ; CI-NEXT: s_setpc_b64 s[30:31] @@ -280,27 +278,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v1, v[8:9] ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v11 -; GFX9-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9] -; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v14, v[10:11] +; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v12, 0 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13] -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: mad_i64_i32_sextops_i32_i128: @@ -312,25 +307,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX1100-NEXT: v_ashrrev_i32_e32 v15, 31, v1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] -; GFX1100-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[7:8], null, v0, v15, v[9:10] -; GFX1100-NEXT: v_mov_b32_e32 v10, v8 -; GFX1100-NEXT: v_mad_i64_i32 v[8:9], null, v1, v14, 0 +; GFX1100-NEXT: v_mov_b32_e32 v7, v9 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[7:8] +; GFX1100-NEXT: v_mad_i64_i32 v[7:8], null, v1, v14, 0 +; GFX1100-NEXT: v_add_co_u32 v9, s0, v10, v12 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-NEXT: v_add_co_ci_u32_e64 v10, null, 0, 0, s0 +; GFX1100-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[7:8] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add_co_u32 v10, s0, v11, v10 -; GFX1100-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[8:9] -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[10:11] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 -; GFX1100-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[9:10] +; GFX1100-NEXT: v_add_co_u32 v7, vcc_lo, v0, v12 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1100-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v13, vcc_lo ; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 -; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo -; GFX1100-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo +; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v3, vcc_lo +; GFX1100-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v4, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1100-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v8, v5, vcc_lo ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_sextops_i32_i128: @@ -338,21 +332,20 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0 ; GFX1150-NEXT: v_mov_b32_e32 v8, 0 -; GFX1150-NEXT: v_ashrrev_i32_e32 v12, 31, v0 -; GFX1150-NEXT: v_ashrrev_i32_e32 v13, 31, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mad_u64_u32 v[9:10], null, v12, v1, v[7:8] -; GFX1150-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8 +; GFX1150-NEXT: v_ashrrev_i32_e32 v13, 31, v0 +; GFX1150-NEXT: v_ashrrev_i32_e32 v14, 31, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_mad_i64_i32 v[11:12], null, v1, v13, 0 +; GFX1150-NEXT: v_mad_u64_u32 v[9:10], null, v13, v1, v[7:8] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mad_u64_u32 v[7:8], null, v0, v13, v[9:10] -; GFX1150-NEXT: v_mov_b32_e32 v10, v8 -; GFX1150-NEXT: v_mad_i64_i32 v[8:9], null, v1, v12, 0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_co_u32 v10, s0, v11, v10 -; GFX1150-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v13, v0, v[8:9] -; GFX1150-NEXT: v_mad_u64_u32 v[8:9], null, v12, v13, v[10:11] +; GFX1150-NEXT: v_mov_b32_e32 v7, v9 +; GFX1150-NEXT: v_mad_u64_u32 v[7:8], null, v0, v14, v[7:8] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v14, v0, v[11:12] +; GFX1150-NEXT: v_add_co_u32 v8, s0, v10, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_co_ci_u32_e64 v9, null, 0, 0, s0 +; GFX1150-NEXT: v_mad_u64_u32 v[8:9], null, v13, v14, v[8:9] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo @@ -372,22 +365,21 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v0 -; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v12, v1, v[7:8] -; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8 +; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v14, 31, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_i64_i32 v[11:12], null, v1, v13, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v13, v1, v[7:8] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10] -; GFX12-NEXT: v_mov_b32_e32 v10, v8 -; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v12, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_u32 v10, s0, v11, v10 +; GFX12-NEXT: v_mov_b32_e32 v7, v9 +; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v14, v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v14, v0, v[11:12] +; GFX12-NEXT: v_add_co_u32 v8, s0, v10, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 -; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11] +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, 0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9] ; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index fab5d386446d3..6f21df3a06ce7 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -28,29 +28,28 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 +; GCN-NEXT: buffer_load_dword v5, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s4, s[4:5], 0xf ; GCN-NEXT: s_mov_b64 s[2:3], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s4, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v4 -; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_i32_e32 v4, s4, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: v_mov_b32_e32 v5, v3 ; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN-NEXT: s_cbranch_execnz .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %atomicrmw.end ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GCN-NEXT: .LBB0_4: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 01eb1b1a353d1..2003cb163a985 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2653,41 +2653,38 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c -; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x7c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 -; VI-NEXT: s_mul_i32 s4, s12, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 -; VI-NEXT: s_mul_i32 s6, s13, s10 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0 +; VI-NEXT: s_mul_i32 s3, s8, s3 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_add_u32_e32 v3, vcc, s3, v3 +; VI-NEXT: s_mul_i32 s12, s9, s2 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v6, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3 ; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s14, v8, v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, v7 -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: v_mov_b32_e32 v8, s13 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s8, v8, v[6:7] -; VI-NEXT: s_mul_i32 s6, s15, s8 -; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v2 -; VI-NEXT: v_mov_b32_e32 v2, v5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v8, v[2:3] -; VI-NEXT: s_mul_i32 s6, s14, s9 -; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v6 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; VI-NEXT: v_mov_b32_e32 v1, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s1, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s10, v8, v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: v_mov_b32_e32 v6, s9 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s0, v6, v[4:5] +; VI-NEXT: s_mul_i32 s8, s11, s0 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v9 +; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; VI-NEXT: v_addc_u32_e64 v3, s[2:3], 0, 0, vcc +; VI-NEXT: s_mul_i32 s8, s10, s1 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s1, v6, v[2:3] +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_i128: @@ -3012,52 +3009,49 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 ; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: v_mov_b32_e32 v3, v10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3] -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; VI-NEXT: v_addc_u32_e64 v4, s[0:1], 0, 0, vcc -; VI-NEXT: v_mul_lo_u32 v0, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4] -; VI-NEXT: v_mul_lo_u32 v1, v6, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v10 -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v10, vcc, v3, v9 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v4, v0, vcc +; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] ; VI-NEXT: v_mov_b32_e32 v9, v2 +; VI-NEXT: v_mul_lo_u32 v2, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; VI-NEXT: v_mul_lo_u32 v4, v6, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 +; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 +; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 +; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 4, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v13, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v13, v10 -; GFX9-NEXT: v_mul_lo_u32 v13, v6, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 ; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_mov_b32_e32 v10, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] -; GFX9-NEXT: v_mul_lo_u32 v0, v7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11] -; GFX9-NEXT: v_add3_u32 v0, v0, v9, v13 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc -; GFX9-NEXT: global_store_dwordx4 v12, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] +; GFX9-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] +; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] +; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc +; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: @@ -3071,22 +3065,20 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2 +; GFX10-NEXT: v_mul_lo_u32 v14, v5, v2 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10] -; GFX10-NEXT: v_mov_b32_e32 v14, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, v10 -; GFX10-NEXT: v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12] +; GFX10-NEXT: v_mov_b32_e32 v9, v11 ; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0 -; GFX10-NEXT: v_mul_lo_u32 v12, v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v10 -; GFX10-NEXT: v_add3_u32 v3, v3, v11, v15 -; GFX10-NEXT: v_add_co_u32 v10, s0, v14, v4 +; GFX10-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s0, v0, v5, v[9:10] +; GFX10-NEXT: v_add3_u32 v3, v3, v11, v14 +; GFX10-NEXT: v_add_co_u32 v10, s0, v12, v10 ; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s0, 0, 0, s0 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11] -; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12 +; GFX10-NEXT: v_add3_u32 v3, v7, v3, v4 ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] @@ -3097,37 +3089,37 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1] -; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v17, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v17, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 -; GFX11-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX11-NEXT: v_mul_lo_u32 v18, v5, v2 ; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] -; GFX11-NEXT: v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v4, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[15:16], null, v4, v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: v_add3_u32 v16, v16, v3, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12] ; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add3_u32 v12, v12, v3, v14 -; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v11, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] +; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 +; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16] ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] -; GFX11-NEXT: v_add3_u32 v0, v10, v14, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13 +; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4 +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo -; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] +; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: @@ -3142,29 +3134,27 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 -; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2 +; GFX12-NEXT: v_mul_lo_u32 v14, v5, v2 ; GFX12-NEXT: v_mul_lo_u32 v7, v7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10] -; GFX12-NEXT: v_mov_b32_e32 v14, v12 -; GFX12-NEXT: v_mov_b32_e32 v12, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12] +; GFX12-NEXT: v_mov_b32_e32 v9, v11 ; GFX12-NEXT: v_mul_lo_u32 v11, v4, v3 ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0 -; GFX12-NEXT: v_mul_lo_u32 v12, v6, v1 -; GFX12-NEXT: v_mov_b32_e32 v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add3_u32 v3, v3, v11, v15 -; GFX12-NEXT: v_add_co_u32 v10, s0, v14, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[9:10] +; GFX12-NEXT: v_add3_u32 v3, v3, v11, v14 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_u32 v10, s0, v12, v10 ; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11] -; GFX12-NEXT: v_add3_u32 v3, v7, v3, v12 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add3_u32 v3, v7, v3, v4 ; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 704947523f677..afe1f33d15e42 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -203,30 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15] ; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 -; GFX9-NEXT: v_mul_lo_u32 v10, v10, v23 -; GFX9-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v11, v[13:14] -; GFX9-NEXT: v_add3_u32 v8, v8, v16, v9 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v8, v14 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mul_lo_u32 v12, v12, v22 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v11, v[8:9] -; GFX9-NEXT: v_add3_u32 v4, v10, v7, v12 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] +; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 +; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] +; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 @@ -1683,27 +1680,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 ; GFX9-NEXT: v_mov_b32_e32 v17, 0 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 ; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 -; GFX9-NEXT: v_mul_lo_u32 v16, v15, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] ; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] -; GFX9-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-NEXT: v_mul_lo_u32 v10, v14, v5 -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v6, v16, v9, v10 +; GFX9-NEXT: v_mov_b32_e32 v16, v11 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v6, v14, v5 +; GFX9-NEXT: v_mul_lo_u32 v14, v15, v4 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12] +; GFX9-NEXT: v_add3_u32 v6, v14, v9, v6 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 57c54c4de7102..d06d9f97db71c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -1594,20 +1594,20 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i23: @@ -1783,20 +1783,20 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; GCN-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; GCN-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i24: diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index ae70abc7317c3..e2bcf3f6a2e2c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -9779,118 +9779,111 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x86a00 ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x86600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x86200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x85e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_mov_b32 s2, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:80 ; GFX6-NEXT: s_mov_b32 s2, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x84200 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x83a00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x83200 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32 -; GFX6-NEXT: s_mov_b32 s2, 0x83600 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: s_mov_b32 s2, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[5:6], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: s_mov_b32 s2, 0x83600 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: s_mov_b32 s2, 0x83a00 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -9905,17 +9898,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: s_mov_b32 s0, 0x83e00 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v8 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; GFX6-NEXT: ;;#ASMSTART @@ -9938,7 +9930,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_mov_b32 s6, 0x83200 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] ; GFX6-NEXT: ;;#ASMEND @@ -9957,6 +9949,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s6 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s6 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s6 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s6 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_mov_b64 vcc, s[6:7] ; GFX6-NEXT: s_cbranch_execz .LBB1_2 @@ -10187,126 +10184,127 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s0, 0x86a00 -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:240 +; GFX6-NEXT: s_mov_b32 s0, 0x86a00 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x86600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x86200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83a00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s0, 0x83e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83e00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s0, 0x83a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll index 1cc5b7f7d14ee..57496c2be54be 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll @@ -85,13 +85,13 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 { ; GFX908-DAG: v_accvgpr_read_b32 ; GFX900: NumVgprs: 256 -; GFX900: ScratchSize: 132 -; GFX908: NumVgprs: 252 +; GFX900: ScratchSize: 148 +; GFX908: NumVgprs: 254 ; GFX908: ScratchSize: 0 ; GFX900: VGPRBlocks: 63 -; GFX908: VGPRBlocks: 62 +; GFX908: VGPRBlocks: 63 ; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 252 +; GFX908: NumVGPRsForWavesPerEU: 254 define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 8150328dd24f0..ef1adbb395e76 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -282,43 +282,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: v_readfirstlane_b32 s1, v3 -; VI-NEXT: v_readfirstlane_b32 s2, v0 -; VI-NEXT: v_readfirstlane_b32 s3, v1 -; VI-NEXT: s_ashr_i32 s8, s3, 16 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_ashr_i32 s9, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_ashr_i32 s10, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s11, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s0, s2, s0 -; VI-NEXT: s_ashr_i32 s2, s9, s11 -; VI-NEXT: s_ashr_i32 s1, s3, s1 -; VI-NEXT: s_ashr_i32 s3, s8, s10 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s3 -; VI-NEXT: s_or_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_readfirstlane_b32 s4, v2 +; VI-NEXT: v_readfirstlane_b32 s5, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_ashr_i32 s8, s7, 16 +; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: s_ashr_i32 s9, s6, 16 +; VI-NEXT: s_sext_i32_i16 s6, s6 +; VI-NEXT: s_ashr_i32 s10, s5, 16 +; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: s_ashr_i32 s11, s4, 16 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_ashr_i32 s4, s6, s4 +; VI-NEXT: s_ashr_i32 s6, s9, s11 +; VI-NEXT: s_ashr_i32 s5, s7, s5 +; VI-NEXT: s_ashr_i32 s7, s8, s10 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 530226baa775e..0d682a6627a1a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -2509,9 +2509,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, v3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v3 ; VI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] ; VI-NEXT: v_alignbit_b32 v0, v1, v0, 7 @@ -2530,9 +2528,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 7 @@ -2548,10 +2544,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GFX1030-NEXT: v_mul_hi_u32 v2, 0x71b47843, v4 ; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0x71b47843, v5, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v0, v1 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xa7c5ac4, v4, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v1, v3 -; GFX1030-NEXT: v_add_co_u32 v0, s4, v0, v1 +; GFX1030-NEXT: v_add_co_u32 v0, s4, v1, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0xa7c5ac4, v5, v[0:1] ; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 7 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index c6cc479b5deb1..5360ff2fa402f 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -2411,7 +2411,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -2435,7 +2434,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2457,7 +2455,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -2480,8 +2477,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -2502,8 +2498,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; GFX12-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm