diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7ed055e8da2b69..a5fc9cad971225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4165,22 +4165,17 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal = RHS->getZExtValue();
 
-  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
-  if (RHSVal == 32) {
+  // For C >= 32
+  // (sra i64:x, C) -> build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31)
+  if (RHSVal >= 32) {
     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
-    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
-                                   DAG.getConstant(31, SL, MVT::i32));
+    Hi = DAG.getFreeze(Hi);
+    SDValue HiShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+                                  DAG.getConstant(31, SL, MVT::i32));
+    SDValue LoShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+                                  DAG.getConstant(RHSVal - 32, SL, MVT::i32));
 
-    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
-    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
-  }
-
-  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
-  if (RHSVal == 63) {
-    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
-    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
-                                   DAG.getConstant(31, SL, MVT::i32));
-    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
+    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {LoShift, HiShift});
     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index 8ed8d905c5512e..eaceafcb06089e 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -685,16 +685,16 @@ define amdgpu_kernel void @ashr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_bfe_i32 v6, v3, 0, 16
-; CI-NEXT: v_ashr_i64 v[3:4], v[2:3], 56
-; CI-NEXT: v_bfe_i32 v5, v2, 0, 16
+; CI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; CI-NEXT: v_bfe_i32 v5, v3, 0, 16
+; CI-NEXT: v_ashrrev_i32_e32 v3, 24, v3
 ; CI-NEXT: v_ashrrev_i32_e32 v2, 24, v2
-; CI-NEXT: v_bfe_u32 v4, v6, 8, 16
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_bfe_u32 v5, v5, 8, 16
 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_or_b32_e32 v3, v4, v3
-; CI-NEXT: v_or_b32_e32 v2, v5, v2
+; CI-NEXT: v_bfe_u32 v5, v5, 8, 16
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_bfe_u32 v4, v4, 8, 16
+; CI-NEXT: v_or_b32_e32 v3, v5, v3
+; CI-NEXT: v_or_b32_e32 v2, v4, v2
 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; CI-NEXT: s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
index 933c6506d02709..613fdf388c0f17 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
@@ -150,9 +150,9 @@ define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr:
 ; CHECK: ; %bb.0: ; %bb
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mul_hi_u32 v4, v1, v0
-; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[3:4]
-; CHECK-NEXT: flat_store_dword v[2:3], v4
+; CHECK-NEXT: v_mul_hi_u32 v0, v1, v0
+; CHECK-NEXT: flat_store_dword v[2:3], v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 1, v0
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 06c04172118097..51398a45055eb3 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -4398,9 +4398,10 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k: diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 3465c782bd700b..189b8977933811 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1433,15 +1433,25 @@ define i128 @fptoui_f32_to_i128(float %x) { } define i128 @fptosi_f16_to_i128(half %x) { -; GCN-LABEL: fptosi_f16_to_i128: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: fptosi_f16_to_i128: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptosi_f16_to_i128: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi half %x to i128 ret i128 %cvt } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 32c86c094aaa33..cce50222132918 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1643,15 +1643,15 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s4, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[4:5], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s4, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s5, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1666,14 +1666,14 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 48 -; GCN-HSA-NEXT: s_ashr_i32 s4, s2, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s1, s3 +; GCN-HSA-NEXT: s_ashr_i32 s0, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s1, s3, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -6213,19 +6213,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -6240,24 +6241,25 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s4, s3 ; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s10, s3, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6579,33 +6581,35 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -6613,8 +6617,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6622,13 +6626,15 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_mov_b32 s8, s5 ; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], 
s[6:7], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s12, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s7, s7, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -6637,8 +6643,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6646,8 +6652,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6660,8 +6666,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -7185,59 +7191,63 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s4, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 -; 
GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 @@ -7247,124 +7257,128 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s6, s19 -; GCN-HSA-NEXT: s_mov_b32 s10, s17 -; GCN-HSA-NEXT: s_mov_b32 s20, s15 -; GCN-HSA-NEXT: s_mov_b32 s22, s13 -; GCN-HSA-NEXT: s_lshr_b32 s24, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[18:19], s[18:19], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s12, s7 +; GCN-HSA-NEXT: s_mov_b32 s14, s5 +; GCN-HSA-NEXT: s_mov_b32 s16, s3 +; GCN-HSA-NEXT: s_mov_b32 s18, s1 +; GCN-HSA-NEXT: s_ashr_i32 s27, s1, 31 +; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, 
s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s0, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[16:17], s[16:17], 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[28:29], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s28, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s34, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s35, s7, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s28, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s29, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s24, s8, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s8, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 +; GCN-HSA-NEXT: s_add_u32 s14, s8, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: s_add_u32 s14, s8, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -8330,139 +8344,147 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s11 +; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s9, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s13, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s15, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[70:71], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[48:49], s[0:1], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[58:59], s[2:3], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x100000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s71 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s47 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[50:51], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[56:57], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: 
s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v14, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -8475,198 +8497,206 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s40, s15 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; GCN-HSA-NEXT: s_mov_b32 s50, s11 -; GCN-HSA-NEXT: s_mov_b32 s52, s9 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_mov_b32 s56, s5 -; GCN-HSA-NEXT: s_mov_b32 s44, s3 -; GCN-HSA-NEXT: s_mov_b32 s58, s1 -; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[40:41], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 +; GCN-HSA-NEXT: s_mov_b32 s34, s15 +; GCN-HSA-NEXT: s_ashr_i32 s41, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s42, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s61, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s65, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s67, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s69, s11, 16 +; GCN-HSA-NEXT: s_mov_b32 s44, s13 +; GCN-HSA-NEXT: s_mov_b32 s46, s11 +; GCN-HSA-NEXT: s_mov_b32 s48, s9 +; GCN-HSA-NEXT: s_mov_b32 s50, s7 +; GCN-HSA-NEXT: s_mov_b32 s52, s5 +; GCN-HSA-NEXT: s_mov_b32 s38, s3 +; GCN-HSA-NEXT: s_mov_b32 
s36, s1 +; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[34:35], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 +; GCN-HSA-NEXT: s_ashr_i32 s40, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s43, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s70, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s71, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s72, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s73, s15, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[14:15], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s42 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v31, s43 -; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s72 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s70 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s69 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s67 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: s_add_u32 s36, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s37 +; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 +; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s31 +; GCN-HSA-NEXT: 
s_addc_u32 s31, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s44 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s36 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x80 
+; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 88beb0683f8e0d..70bdebedf85bcb 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6405,14 +6405,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 @@ -6429,8 +6430,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) 
%out ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -6449,23 +6450,24 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: s_ashr_i32 s18, s3, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6505,23 +6507,24 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6616,24 +6619,25 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_mov_b32 s8, s3 ; GFX12-NEXT: s_lshr_b32 s10, s2, 16 ; GFX12-NEXT: s_lshr_b32 s12, s2, 24 -; GFX12-NEXT: s_lshr_b32 s14, s2, 8 ; 
GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 -; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 +; GFX12-NEXT: s_ashr_i32 s15, s3, 31 +; GFX12-NEXT: s_ashr_i32 s18, s3, 24 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: s_lshr_b32 s14, s2, 8 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 ; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s15 -; GFX12-NEXT: v_mov_b32_e32 v6, s14 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 @@ -7025,56 +7029,58 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA: ; %bb.0: ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s11, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s4, s11 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s10, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s10, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s9, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s9, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s9 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s8, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s8, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s8, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[8:9], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 
s35, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 @@ -7094,8 +7100,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 @@ -7120,19 +7126,19 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s31, s5, 24 ; GFX7-HSA-NEXT: s_mov_b32 s24, s5 ; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[6:7], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 @@ -7156,6 +7162,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -7170,8 +7178,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 @@ -7181,8 +7189,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7219,103 +7227,105 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s26, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 8 +; GFX8-NOHSA-NEXT: s_ashr_i32 s19, s5, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: 
s_ashr_i64 s[38:39], s[6:7], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 48 +; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -7425,60 +7435,63 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s2, s7, 16 -; GFX12-NEXT: s_lshr_b32 s8, s7, 8 -; GFX12-NEXT: s_mov_b32 s10, s7 -; GFX12-NEXT: s_lshr_b32 s12, s6, 16 -; GFX12-NEXT: s_lshr_b32 
s14, s6, 24 -; GFX12-NEXT: s_lshr_b32 s16, s6, 8 +; GFX12-NEXT: s_lshr_b32 s8, s7, 16 +; GFX12-NEXT: s_lshr_b32 s10, s7, 8 +; GFX12-NEXT: s_mov_b32 s12, s7 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: s_ashr_i32 s33, s7, 31 +; GFX12-NEXT: s_ashr_i32 s36, s7, 24 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: s_lshr_b32 s18, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: s_lshr_b32 s14, s6, 16 +; GFX12-NEXT: s_lshr_b32 s16, s6, 24 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s35 -; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v9, s11 -; GFX12-NEXT: s_lshr_b32 s20, s5, 8 -; GFX12-NEXT: s_mov_b32 s22, s5 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: s_lshr_b32 s18, s6, 8 +; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s35 +; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: s_lshr_b32 s20, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s13 -; GFX12-NEXT: s_lshr_b32 s24, s4, 16 -; GFX12-NEXT: s_lshr_b32 s26, s4, 24 -; GFX12-NEXT: s_lshr_b32 s28, s4, 8 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 -; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s11 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s15 +; GFX12-NEXT: s_lshr_b32 s22, s5, 8 +; GFX12-NEXT: s_mov_b32 s24, s5 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 -; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: s_lshr_b32 s26, s4, 16 +; GFX12-NEXT: s_lshr_b32 s28, s4, 24 +; GFX12-NEXT: s_ashr_i32 s29, s5, 31 +; GFX12-NEXT: s_ashr_i32 s31, s5, 24 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v17, s19 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s17 +; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: s_lshr_b32 s30, s4, 8 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s5 -; GFX12-NEXT: v_mov_b32_e32 v18, s4 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v6, s18 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v3, s21 -; GFX12-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v2, s20 -; GFX12-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v8, s24 -; GFX12-NEXT: v_dual_mov_b32 v11, s27 
:: v_dual_mov_b32 v10, s26 -; GFX12-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v20, s30 -; GFX12-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v22, s28 +; GFX12-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s29 +; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v2, s31 +; GFX12-NEXT: v_mov_b32_e32 v9, s25 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v11, s23 +; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v17, s27 +; GFX12-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v19, s7 +; GFX12-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s3 +; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s5 +; GFX12-NEXT: v_mov_b32_e32 v22, s4 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in @@ -8191,146 +8204,151 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s38, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s42, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s30, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s34, s5 ; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 16 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 -; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[38:39], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[2:3], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 
0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s44, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s52, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s71 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s56 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s8 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v11, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[52:53], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s6 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX6-NOHSA-NEXT: 
buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:16 @@ -8343,207 +8361,213 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s52, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s62, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s34, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s16, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[18:19], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[44:45], s[4:5], 56 -; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s42, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s1, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s45, s3, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s46, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s54, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s36, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s26, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s1, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s0, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000 +; 
GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s66, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s67, s5, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s68, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s69, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[64:65], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s44 -; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45 -; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 
s[60:61], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s58 +; GFX7-HSA-NEXT: s_add_u32 s58, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s59 +; GFX7-HSA-NEXT: s_addc_u32 s59, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s49 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s49 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s63 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s69 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s58 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s30 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s31 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s31 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s57 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29 -; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 ; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s19 -; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v2, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s38 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s24 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s30 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s28 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s25 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s8, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: s_add_u32 s16, s8, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: s_addc_u32 s17, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: s_add_u32 s14, s8, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 @@ -8556,116 +8580,120 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s5, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s62, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s54, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s46, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s4, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s3, 8 -; GFX8-NOHSA-NEXT: 
s_mov_b32 s28, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s2, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s1, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s1, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s64, s1 -; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s0, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s68, s0, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s70, s0, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s30, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s1, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s1, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s18, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s0, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s0, 8 +; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s3, 24 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[12:13], s[0:1], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[26:27], s[2:3], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[70:71], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s1, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s3, 31 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[64:65], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s5, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 24 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX8-NOHSA-NEXT: 
s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s74 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s66, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50 +; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xf0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 +; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s66 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xe0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xd0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s72 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s73 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NOHSA-NEXT: s_add_u32 s42, s8, 0xb0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NOHSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NOHSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xc0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 +; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xb0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xa0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 @@ -8673,94 +8701,90 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX8-NOHSA-NEXT: s_add_u32 s34, s8, 0x80 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NOHSA-NEXT: s_addc_u32 s35, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NOHSA-NEXT: s_add_u32 s36, s8, 0x80 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NOHSA-NEXT: s_addc_u32 s37, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NOHSA-NEXT: s_add_u32 s26, s8, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: s_addc_u32 s27, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NOHSA-NEXT: s_add_u32 s26, s8, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s27, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NOHSA-NEXT: s_add_u32 s22, s8, 0x50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX8-NOHSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NOHSA-NEXT: s_add_u32 s16, s8, 64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: s_addc_u32 s17, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s8, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -8960,83 +8984,87 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s36, s7, 16 -; GFX12-NEXT: s_lshr_b32 s38, s7, 8 -; GFX12-NEXT: s_mov_b32 s40, s7 -; GFX12-NEXT: s_lshr_b32 s42, s6, 16 -; GFX12-NEXT: s_lshr_b32 s44, s6, 24 -; GFX12-NEXT: s_ashr_i64 s[74:75], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: s_lshr_b32 s46, s6, 8 +; GFX12-NEXT: s_lshr_b32 s40, s7, 16 +; GFX12-NEXT: s_lshr_b32 s50, s6, 8 +; GFX12-NEXT: s_lshr_b32 s62, s3, 16 +; GFX12-NEXT: s_ashr_i32 s51, s3, 24 +; GFX12-NEXT: s_lshr_b32 s42, s7, 8 +; GFX12-NEXT: s_mov_b32 s44, s7 +; GFX12-NEXT: s_lshr_b32 s46, s6, 16 +; GFX12-NEXT: s_lshr_b32 s48, s6, 24 +; GFX12-NEXT: s_lshr_b32 s38, s5, 16 +; GFX12-NEXT: s_lshr_b32 s52, s5, 8 +; GFX12-NEXT: s_mov_b32 s54, s5 +; GFX12-NEXT: s_lshr_b32 s56, s4, 16 +; GFX12-NEXT: s_lshr_b32 s58, s4, 24 +; GFX12-NEXT: s_lshr_b32 s60, s4, 8 +; GFX12-NEXT: s_lshr_b32 s36, s3, 8 +; GFX12-NEXT: s_mov_b32 s34, s3 +; GFX12-NEXT: s_lshr_b32 s28, s2, 16 +; GFX12-NEXT: s_lshr_b32 s26, s2, 24 +; GFX12-NEXT: s_lshr_b32 s24, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 +; GFX12-NEXT: s_ashr_i32 s39, s3, 31 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 +; GFX12-NEXT: s_ashr_i32 s62, s5, 31 +; GFX12-NEXT: s_ashr_i32 s63, s5, 24 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 +; GFX12-NEXT: s_ashr_i32 s50, s7, 31 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 +; GFX12-NEXT: s_ashr_i32 s7, s7, 24 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75 -; GFX12-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41 -; GFX12-NEXT: s_lshr_b32 s48, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s41 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s39 -; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s43 -; GFX12-NEXT: s_lshr_b32 s50, s5, 8 -; GFX12-NEXT: s_mov_b32 s52, s5 -; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 -; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s73 -; GFX12-NEXT: s_lshr_b32 s54, s4, 16 -; GFX12-NEXT: s_lshr_b32 s56, s4, 24 -; GFX12-NEXT: s_ashr_i64 s[70:71], s[4:5], 56 -; GFX12-NEXT: v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s47 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x80000 -; GFX12-NEXT: 
v_mov_b32_e32 v14, s46 -; GFX12-NEXT: s_lshr_b32 s58, s4, 8 +; GFX12-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v3, s50 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s45 +; GFX12-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v7, s43 +; GFX12-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v9, s47 +; GFX12-NEXT: v_dual_mov_b32 v8, s46 :: v_dual_mov_b32 v11, s49 +; GFX12-NEXT: v_dual_mov_b32 v10, s48 :: v_dual_mov_b32 v13, s67 +; GFX12-NEXT: v_dual_mov_b32 v12, s66 :: v_dual_mov_b32 v15, s5 +; GFX12-NEXT: v_mov_b32_e32 v14, s4 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[38:39], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX12-NEXT: s_lshr_b32 s60, s3, 16 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s71 -; GFX12-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s70 -; GFX12-NEXT: v_mov_b32_e32 v5, s53 -; GFX12-NEXT: s_lshr_b32 s34, s3, 8 -; GFX12-NEXT: s_mov_b32 s30, s3 -; GFX12-NEXT: s_lshr_b32 s24, s2, 16 -; GFX12-NEXT: s_lshr_b32 s22, s2, 24 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s51 -; GFX12-NEXT: v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s55 -; GFX12-NEXT: s_lshr_b32 s20, s2, 8 -; GFX12-NEXT: s_ashr_i64 s[26:27], s[2:3], 56 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s62 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s63 +; GFX12-NEXT: v_mov_b32_e32 v5, s55 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s57 -; GFX12-NEXT: v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s29 -; GFX12-NEXT: s_lshr_b32 s18, s1, 16 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s54 :: v_dual_mov_b32 v7, s53 +; GFX12-NEXT: v_dual_mov_b32 v6, s52 :: v_dual_mov_b32 v9, s57 +; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59 +; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31 +; GFX12-NEXT: s_lshr_b32 s22, s1, 16 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s59 -; GFX12-NEXT: v_dual_mov_b32 v14, s58 :: v_dual_mov_b32 v17, s61 -; GFX12-NEXT: s_lshr_b32 s14, s1, 8 -; GFX12-NEXT: s_mov_b32 s62, s1 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s60 :: v_dual_mov_b32 v19, s27 -; GFX12-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v21, s31 -; GFX12-NEXT: s_lshr_b32 s64, s0, 16 -; GFX12-NEXT: s_lshr_b32 s66, s0, 24 -; GFX12-NEXT: s_ashr_i64 s[12:13], s[0:1], 56 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v20, s30 :: 
v_dual_mov_b32 v23, s35 -; GFX12-NEXT: v_mov_b32_e32 v22, s34 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61 +; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s3 +; GFX12-NEXT: s_lshr_b32 s16, s1, 8 +; GFX12-NEXT: s_mov_b32 s18, s1 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s39 +; GFX12-NEXT: v_dual_mov_b32 v18, s51 :: v_dual_mov_b32 v21, s35 +; GFX12-NEXT: s_lshr_b32 s14, s0, 16 +; GFX12-NEXT: s_lshr_b32 s12, s0, 24 +; GFX12-NEXT: s_ashr_i32 s6, s1, 31 +; GFX12-NEXT: s_ashr_i32 s33, s1, 24 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37 +; GFX12-NEXT: v_mov_b32_e32 v22, s36 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 @@ -9044,24 +9072,25 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v3, s23 -; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v2, s22 -; GFX12-NEXT: v_mov_b32_e32 v5, s17 -; GFX12-NEXT: s_lshr_b32 s68, s0, 8 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s27 +; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s26 +; GFX12-NEXT: v_mov_b32_e32 v5, s21 +; GFX12-NEXT: s_lshr_b32 s64, s0, 8 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s25 +; GFX12-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v9, s23 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s21 -; GFX12-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v9, s19 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v11, s13 -; GFX12-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v13, s7 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s6 +; GFX12-NEXT: v_dual_mov_b32 v10, s33 :: v_dual_mov_b32 v13, s19 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s15 -; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v17, s5 -; GFX12-NEXT: v_dual_mov_b32 v16, s4 :: v_dual_mov_b32 v19, s3 -; GFX12-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v21, s11 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s17 +; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v17, s15 +; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s13 +; GFX12-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v21, s11 ; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1 ; GFX12-NEXT: v_mov_b32_e32 v22, s0 ; GFX12-NEXT: s_clause 0x5 @@ -10214,34 +10243,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s3 -; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000 -; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s7, s1, s0 -; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s4, s3 +; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s3, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xffff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s2, 0x80000 -; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX8-NOHSA-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s3, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s6, s6, s0 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56 -; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16 -; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xffff0000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s4, 0x80000 -; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NOHSA-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s6, s5 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10276,26 +10305,26 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: s_ashr_i64 s[4:5], s[2:3], 56 -; GFX12-TRUE16-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-TRUE16-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-TRUE16-NEXT: s_bfe_i32 s5, s3, 0x80000 +; GFX12-TRUE16-NEXT: s_lshr_b32 s4, s2, 16 +; GFX12-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX12-TRUE16-NEXT: s_ashr_i32 s6, s3, 24 +; GFX12-TRUE16-NEXT: s_bfe_i32 s7, s3, 0x80000 ; GFX12-TRUE16-NEXT: s_sext_i32_i16 s3, s3 ; GFX12-TRUE16-NEXT: s_ashr_i32 s8, s2, 24 ; GFX12-TRUE16-NEXT: s_mov_b32 s9, s2 ; GFX12-TRUE16-NEXT: s_sext_i32_i16 s2, s2 -; GFX12-TRUE16-NEXT: s_bfe_i32 s7, s7, 0x80000 +; GFX12-TRUE16-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX12-TRUE16-NEXT: s_lshr_b32 s3, s3, 8 -; GFX12-TRUE16-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-TRUE16-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX12-TRUE16-NEXT: s_bfe_i32 s9, s9, 0x80000 ; GFX12-TRUE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 
s4, s7, s4 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s7, s3 ; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s9, s2 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s6, s8 +; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-TRUE16-NEXT: s_endpgm @@ -10306,25 +10335,25 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: s_ashr_i64 s[4:5], s[2:3], 56 -; GFX12-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-FAKE16-NEXT: s_bfe_i32 s5, s3, 0x80000 +; GFX12-FAKE16-NEXT: s_lshr_b32 s4, s2, 16 +; GFX12-FAKE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX12-FAKE16-NEXT: s_ashr_i32 s6, s3, 24 +; GFX12-FAKE16-NEXT: s_bfe_i32 s7, s3, 0x80000 ; GFX12-FAKE16-NEXT: s_sext_i32_i16 s3, s3 ; GFX12-FAKE16-NEXT: s_ashr_i32 s8, s2, 24 ; GFX12-FAKE16-NEXT: s_bfe_i32 s9, s2, 0x80000 ; GFX12-FAKE16-NEXT: s_sext_i32_i16 s2, s2 -; GFX12-FAKE16-NEXT: s_bfe_i32 s7, s7, 0x80000 +; GFX12-FAKE16-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 8 -; GFX12-FAKE16-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-FAKE16-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s7, s4 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s7, s3 ; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s9, s2 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s6, s8 +; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-FAKE16-NEXT: s_endpgm @@ -10706,62 +10735,62 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 16 ; GFX8-NOHSA-NEXT: s_sext_i32_i16 s10, s5 ; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s5, 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 16 -; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xffff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i16 s12, s4 ; GFX8-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s5 -; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s4, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 ; GFX8-NOHSA-NEXT: 
s_and_b32 s11, 0xffff, s11 ; GFX8-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s10, s10, 0xffff0000 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s5, 16 ; GFX8-NOHSA-NEXT: s_or_b32 s10, s11, s10 ; GFX8-NOHSA-NEXT: s_and_b32 s11, s12, 0xffff0000 ; GFX8-NOHSA-NEXT: s_bfe_i32 s12, s4, 0x80000 -; GFX8-NOHSA-NEXT: s_or_b32 s4, s2, s3 -; GFX8-NOHSA-NEXT: s_sext_i32_i16 s2, s7 -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 16 +; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s8, s8, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s9, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s8, s4 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s8, s7 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s9, s5 +; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s7, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s8, s8, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s12, 0xffff, s12 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xffff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s9, s6 ; GFX8-NOHSA-NEXT: s_or_b32 s11, s12, s11 -; GFX8-NOHSA-NEXT: s_or_b32 s12, s3, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i16 s2, s6 -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s6, 0x80000 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xffff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s7, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s13, s3, s2 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s9, 0x80000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16 -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s12, s6, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s9, s9, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s12, 0xffff, s12 +; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s6, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s8, 0x80000 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s6, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s9, s12, s9 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s3, s7 +; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10814,43 +10843,43 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 -; GFX12-NEXT: s_lshr_b32 s8, s6, 16 -; GFX12-NEXT: s_lshr_b32 s9, s7, 16 -; GFX12-NEXT: s_bfe_i32 s3, s7, 0x80000 +; GFX12-NEXT: s_lshr_b32 s2, s6, 16 +; GFX12-NEXT: s_lshr_b32 s3, s7, 16 +; GFX12-NEXT: s_ashr_i32 s14, s7, 24 +; GFX12-NEXT: s_bfe_i32 s15, s7, 0x80000 ; GFX12-NEXT: s_sext_i32_i16 s7, s7 ; GFX12-NEXT: s_ashr_i32 s16, s6, 24 ; GFX12-NEXT: s_bfe_i32 s17, s6, 0x80000 ; GFX12-NEXT: s_sext_i32_i16 s6, s6 -; GFX12-NEXT: s_lshr_b32 s10, s4, 16 -; GFX12-NEXT: s_lshr_b32 s11, s5, 16 -; GFX12-NEXT: s_ashr_i32 s12, s5, 16 -; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80000 +; GFX12-NEXT: s_lshr_b32 s8, s4, 16 +; GFX12-NEXT: s_lshr_b32 s9, s5, 16 +; GFX12-NEXT: s_ashr_i32 s10, s5, 16 +; GFX12-NEXT: s_bfe_i32 s11, s5, 0x80000 ; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: s_ashr_i32 s14, s4, 24 -; GFX12-NEXT: s_bfe_i32 s15, s4, 0x80000 +; GFX12-NEXT: s_ashr_i32 s12, s4, 24 +; GFX12-NEXT: s_bfe_i32 s13, s4, 0x80000 ; GFX12-NEXT: s_sext_i32_i16 s4, s4 -; GFX12-NEXT: s_bfe_i32 s9, s9, 0x80000 +; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX12-NEXT: s_lshr_b32 s7, s7, 8 -; GFX12-NEXT: s_bfe_i32 s8, s8, 0x80000 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000 ; GFX12-NEXT: s_lshr_b32 s6, s6, 8 -; GFX12-NEXT: s_lshr_b32 s12, s12, 8 -; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80000 +; GFX12-NEXT: s_lshr_b32 s10, s10, 8 +; GFX12-NEXT: s_bfe_i32 s9, s9, 0x80000 ; GFX12-NEXT: s_lshr_b32 s5, s5, 8 -; GFX12-NEXT: s_bfe_i32 s10, s10, 0x80000 +; GFX12-NEXT: s_bfe_i32 s8, s8, 0x80000 ; GFX12-NEXT: s_lshr_b32 s4, s4, 8 -; GFX12-NEXT: s_pack_ll_b32_b16 s2, s9, s2 -; GFX12-NEXT: s_pack_ll_b32_b16 s3, s3, s7 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s3, s14 +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s15, s7 ; GFX12-NEXT: s_pack_ll_b32_b16 s6, s17, s6 -; GFX12-NEXT: s_pack_ll_b32_b16 s7, s8, s16 -; GFX12-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX12-NEXT: s_pack_ll_b32_b16 s5, s13, s5 -; GFX12-NEXT: s_pack_ll_b32_b16 s10, s10, s14 -; GFX12-NEXT: s_pack_ll_b32_b16 s4, s15, s4 -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s2 -; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s10 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s16 +; GFX12-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s11, s5 +; GFX12-NEXT: s_pack_ll_b32_b16 s8, s8, s12 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s13, s4 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s8 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s9 ; GFX12-NEXT: v_mov_b32_e32 v6, s5 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 @@ -11586,11 +11615,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s4 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s4, 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 ; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s5, 16 ; 
GFX8-NOHSA-NEXT: s_or_b32 s15, s1, s0 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 +; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s5, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s13, 0x80000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 @@ -11719,183 +11748,94 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; -; GFX12-TRUE16-LABEL: constant_sextload_v32i8_to_v32i16: -; GFX12-TRUE16: ; %bb.0: -; GFX12-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: s_lshr_b32 s13, s5, 16 -; GFX12-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 -; GFX12-TRUE16-NEXT: s_lshr_b32 s17, s1, 16 -; GFX12-TRUE16-NEXT: s_ashr_i32 s18, s1, 16 -; GFX12-TRUE16-NEXT: s_bfe_i32 s19, s1, 0x80000 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s20, s1 -; GFX12-TRUE16-NEXT: s_ashr_i32 s21, s0, 24 -; GFX12-TRUE16-NEXT: s_bfe_i32 s22, s0, 0x80000 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s23, s0 -; GFX12-TRUE16-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 -; GFX12-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s1, s5 -; GFX12-TRUE16-NEXT: s_bfe_i32 s13, s13, 0x80000 -; GFX12-TRUE16-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s13, s0 -; GFX12-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 -; GFX12-TRUE16-NEXT: s_ashr_i32 s13, s4, 24 -; GFX12-TRUE16-NEXT: s_bfe_i32 s12, s12, 0x80000 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s5, s1 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s12, s13 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s12, s4 -; GFX12-TRUE16-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s12, s12, 8 -; GFX12-TRUE16-NEXT: s_ashr_i32 s13, s7, 16 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX12-TRUE16-NEXT: s_lshr_b32 s12, s13, 8 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s13, s7 -; GFX12-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 -; GFX12-TRUE16-NEXT: s_bfe_i32 s7, s7, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s13, s13, 8 -; GFX12-TRUE16-NEXT: s_lshr_b32 s10, s6, 16 -; GFX12-TRUE16-NEXT: s_bfe_i32 s11, s11, 0x80000 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s13, s6 -; GFX12-TRUE16-NEXT: s_lshr_b32 s14, s2, 16 -; GFX12-TRUE16-NEXT: s_lshr_b32 s15, s3, 16 -; GFX12-TRUE16-NEXT: s_ashr_i32 s24, s3, 16 -; GFX12-TRUE16-NEXT: s_bfe_i32 s25, s3, 0x80000 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s3, s3 -; GFX12-TRUE16-NEXT: s_ashr_i32 s26, s2, 24 -; GFX12-TRUE16-NEXT: s_bfe_i32 s27, s2, 0x80000 -; GFX12-TRUE16-NEXT: s_sext_i32_i16 s2, s2 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX12-TRUE16-NEXT: s_ashr_i32 s12, s6, 24 -; GFX12-TRUE16-NEXT: s_bfe_i32 s6, s6, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s13, s13, 8 -; GFX12-TRUE16-NEXT: s_bfe_i32 s10, s10, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s24, s24, 8 -; GFX12-TRUE16-NEXT: s_bfe_i32 s15, s15, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s3, s3, 8 -; GFX12-TRUE16-NEXT: s_bfe_i32 s14, s14, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX12-TRUE16-NEXT: s_lshr_b32 s18, s18, 8 -; GFX12-TRUE16-NEXT: s_bfe_i32 s17, s17, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s20, s20, 8 -; GFX12-TRUE16-NEXT: s_bfe_i32 s16, s16, 0x80000 -; GFX12-TRUE16-NEXT: s_lshr_b32 s23, s23, 8 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 
v16, 0 :: v_dual_mov_b32 v1, s10 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s24 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s25, s3 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s26 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s27, s2 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s18 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s19, s20 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s21 -; GFX12-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s22, s23 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v14, s18 -; GFX12-TRUE16-NEXT: s_clause 0x3 -; GFX12-TRUE16-NEXT: global_store_b128 v16, v[0:3], s[8:9] offset:48 -; GFX12-TRUE16-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32 -; GFX12-TRUE16-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16 -; GFX12-TRUE16-NEXT: global_store_b128 v16, v[12:15], s[8:9] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: constant_sextload_v32i8_to_v32i16: -; GFX12-FAKE16: ; %bb.0: -; GFX12-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: s_lshr_b32 s13, s5, 16 -; GFX12-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 -; GFX12-FAKE16-NEXT: s_lshr_b32 s17, s1, 16 -; GFX12-FAKE16-NEXT: s_ashr_i32 s18, s1, 16 -; GFX12-FAKE16-NEXT: s_bfe_i32 s19, s1, 0x80000 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s20, s1 -; GFX12-FAKE16-NEXT: s_ashr_i32 s21, s0, 24 -; GFX12-FAKE16-NEXT: s_bfe_i32 s22, s0, 0x80000 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s23, s0 -; GFX12-FAKE16-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 -; GFX12-FAKE16-NEXT: s_lshr_b32 s12, s4, 16 -; GFX12-FAKE16-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-FAKE16-NEXT: s_bfe_i32 s13, s13, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s5, s5, 8 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s13, s0 -; GFX12-FAKE16-NEXT: s_ashr_i32 s13, s4, 24 -; GFX12-FAKE16-NEXT: s_bfe_i32 s12, s12, 0x80000 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s12, s13 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s12, s4 -; GFX12-FAKE16-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s12, s12, 8 -; GFX12-FAKE16-NEXT: s_ashr_i32 s13, s7, 16 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX12-FAKE16-NEXT: s_lshr_b32 s12, s13, 8 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s13, s7 -; GFX12-FAKE16-NEXT: s_lshr_b32 s11, s7, 16 -; GFX12-FAKE16-NEXT: s_bfe_i32 s7, s7, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s13, s13, 8 -; GFX12-FAKE16-NEXT: s_lshr_b32 s10, s6, 16 -; GFX12-FAKE16-NEXT: s_bfe_i32 s11, s11, 0x80000 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s13, s6 -; GFX12-FAKE16-NEXT: s_lshr_b32 s14, s2, 16 -; GFX12-FAKE16-NEXT: s_lshr_b32 s15, s3, 16 -; GFX12-FAKE16-NEXT: s_ashr_i32 s24, s3, 16 -; GFX12-FAKE16-NEXT: s_bfe_i32 s25, s3, 0x80000 -; GFX12-FAKE16-NEXT: s_sext_i32_i16 s3, s3 -; GFX12-FAKE16-NEXT: s_ashr_i32 s26, s2, 24 -; GFX12-FAKE16-NEXT: s_bfe_i32 s27, s2, 0x80000 -; 
GFX12-FAKE16-NEXT: s_sext_i32_i16 s2, s2 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX12-FAKE16-NEXT: s_ashr_i32 s12, s6, 24 -; GFX12-FAKE16-NEXT: s_bfe_i32 s6, s6, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s13, s13, 8 -; GFX12-FAKE16-NEXT: s_bfe_i32 s10, s10, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s24, s24, 8 -; GFX12-FAKE16-NEXT: s_bfe_i32 s15, s15, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 8 -; GFX12-FAKE16-NEXT: s_bfe_i32 s14, s14, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX12-FAKE16-NEXT: s_lshr_b32 s18, s18, 8 -; GFX12-FAKE16-NEXT: s_bfe_i32 s17, s17, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s20, s20, 8 -; GFX12-FAKE16-NEXT: s_bfe_i32 s16, s16, 0x80000 -; GFX12-FAKE16-NEXT: s_lshr_b32 s23, s23, 8 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s24 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s25, s3 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s26 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s27, s2 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s18 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s19, s20 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s21 -; GFX12-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s22, s23 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v14, s18 -; GFX12-FAKE16-NEXT: s_clause 0x3 -; GFX12-FAKE16-NEXT: global_store_b128 v16, v[0:3], s[8:9] offset:48 -; GFX12-FAKE16-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32 -; GFX12-FAKE16-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16 -; GFX12-FAKE16-NEXT: global_store_b128 v16, v[12:15], s[8:9] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshr_b32 s17, s1, 16 +; GFX12-NEXT: s_ashr_i32 s18, s1, 16 +; GFX12-NEXT: s_lshr_b32 s12, s4, 16 +; GFX12-NEXT: s_lshr_b32 s18, s18, 8 +; GFX12-NEXT: s_bfe_i32 s17, s17, 0x80000 +; GFX12-NEXT: s_bfe_i32 s19, s1, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_pack_ll_b32_b16 s17, s17, s18 +; GFX12-NEXT: s_ashr_i32 s18, s4, 24 +; GFX12-NEXT: s_bfe_i32 s12, s12, 0x80000 +; GFX12-NEXT: s_lshr_b32 s1, s1, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s12, s12, s18 +; GFX12-NEXT: s_sext_i32_i16 s18, s4 +; GFX12-NEXT: s_pack_ll_b32_b16 s1, s19, s1 +; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000 +; GFX12-NEXT: s_lshr_b32 s18, s18, 8 +; GFX12-NEXT: s_ashr_i32 s19, s7, 16 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s4, s18 +; GFX12-NEXT: s_lshr_b32 s18, s19, 8 +; GFX12-NEXT: s_sext_i32_i16 s19, s7 +; GFX12-NEXT: s_lshr_b32 s11, s7, 16 +; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80000 +; GFX12-NEXT: s_lshr_b32 s19, s19, 8 +; GFX12-NEXT: s_lshr_b32 s10, s6, 16 +; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80000 +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s7, s19 +; GFX12-NEXT: s_sext_i32_i16 s19, s6 +; 
GFX12-NEXT: s_lshr_b32 s13, s5, 16 +; GFX12-NEXT: s_ashr_i32 s26, s5, 24 +; GFX12-NEXT: s_bfe_i32 s27, s5, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_lshr_b32 s14, s2, 16 +; GFX12-NEXT: s_lshr_b32 s15, s3, 16 +; GFX12-NEXT: s_ashr_i32 s22, s3, 16 +; GFX12-NEXT: s_bfe_i32 s23, s3, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_ashr_i32 s24, s2, 24 +; GFX12-NEXT: s_bfe_i32 s25, s2, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s11, s11, s18 +; GFX12-NEXT: s_ashr_i32 s18, s6, 24 +; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-NEXT: s_lshr_b32 s19, s19, 8 +; GFX12-NEXT: s_bfe_i32 s10, s10, 0x80000 +; GFX12-NEXT: s_lshr_b32 s16, s0, 16 +; GFX12-NEXT: s_ashr_i32 s20, s0, 24 +; GFX12-NEXT: s_bfe_i32 s21, s0, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s0, s0 +; GFX12-NEXT: s_bfe_i32 s13, s13, 0x80000 +; GFX12-NEXT: s_lshr_b32 s5, s5, 8 +; GFX12-NEXT: s_lshr_b32 s22, s22, 8 +; GFX12-NEXT: s_bfe_i32 s15, s15, 0x80000 +; GFX12-NEXT: s_lshr_b32 s3, s3, 8 +; GFX12-NEXT: s_bfe_i32 s14, s14, 0x80000 +; GFX12-NEXT: s_lshr_b32 s2, s2, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s6, s6, s19 +; GFX12-NEXT: s_pack_ll_b32_b16 s10, s10, s18 +; GFX12-NEXT: s_bfe_i32 s16, s16, 0x80000 +; GFX12-NEXT: s_lshr_b32 s0, s0, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s13, s13, s26 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s27, s5 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10 +; GFX12-NEXT: s_pack_ll_b32_b16 s15, s15, s22 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s23, s3 +; GFX12-NEXT: s_pack_ll_b32_b16 s14, s14, s24 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s25, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s12 +; GFX12-NEXT: s_pack_ll_b32_b16 s16, s16, s20 +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s21, s0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s13 +; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s14 +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15 +; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16 +; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s17 +; GFX12-NEXT: v_mov_b32_e32 v14, s1 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 94dc980e6b5cce..3753737d251e4b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1734,15 +1734,14 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[5:6], v[3:4], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v3, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1755,16 +1754,15 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx2 v[3:4], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 +; GCN-HSA-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashr_i64 v[7:8], v[3:4], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v3 -; GCN-HSA-NEXT: v_bfe_i32 v2, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v3, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32: @@ -5972,17 +5970,18 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[6:7], v[1:2], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -5998,7 +5997,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx2 v[1:2], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 @@ -6006,12 +6005,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[1:2], 48 -; GCN-HSA-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; 
GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 @@ -6352,26 +6352,28 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: @@ -6399,27 +6401,29 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_bfe_i32 v0, v7, 
0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: @@ -6966,48 +6970,52 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[25:26], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v5 ; GCN-NOHSA-SI-NEXT: 
s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[26:27], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v5 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v6, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: @@ -7056,14 +7064,16 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] @@ -7076,29 +7086,31 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 
16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-HSA-NEXT: v_bfe_i32 v10, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v6, v16, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -8095,52 +8107,60 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v1 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[10:11], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[8:9], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[14:15], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[12:13], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v13 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16 @@ -8198,7 +8218,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: 
flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -8206,9 +8226,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 @@ -8233,137 +8253,145 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[8:9], 48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 -; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[4:5], 
48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[12:13], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[14:15], 48 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 +; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15 +; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v18, v24, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v24, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[19:22] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v21, 16, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: v_bfe_i32 v13, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-HSA-NEXT: v_bfe_i32 v9, v23, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[23:26] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v21, v21, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-HSA-NEXT: v_bfe_i32 v17, v22, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_bfe_i32 v3, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v5, v12, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16 +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll index 5fc1a87e71a1a6..75e7a63c540e53 100644 --- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll @@ -110,10 +110,10 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) { ; SDAG-LABEL: range_metadata_sext_i8_signed_range_i64: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; SDAG-NEXT: global_load_dwordx2 v[1:2], v[0:1], off glc ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_lshlrev_b32_e32 v1, 23, v0 -; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[0:1] +; SDAG-NEXT: v_bfe_i32 v0, v1, 0, 9 +; SDAG-NEXT: v_bfe_i32 v1, v1, 8, 1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: range_metadata_sext_i8_signed_range_i64: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index c5c95380fde9ba..bb642155cd0aaf 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -463,14 +463,12 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { ; SI-LABEL: mad_i64_i32_sextops_i31_i63: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; SI-NEXT: v_ashr_i64 v[4:5], v[3:4], 33 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 33 -; SI-NEXT: v_mul_lo_u32 v1, v4, v0 -; SI-NEXT: v_mul_hi_i32 v4, v4, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc +; SI-NEXT: v_bfe_i32 v0, v0, 0, 31 +; SI-NEXT: v_bfe_i32 v1, v1, 0, 31 +; SI-NEXT: v_mul_lo_u32 v4, v0, v1 +; SI-NEXT: v_mul_hi_i32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: mad_i64_i32_sextops_i31_i63: diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 11cf129b1e4792..4377e7569747aa 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -463,57 +463,53 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s6, s[4:5], 0xb -; SI-NEXT: s_load_dword s4, s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_lshl_b32 s7, s4, 8 -; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; SI-NEXT: v_mov_b32_e32 
v0, s6 -; SI-NEXT: s_mul_i32 s5, s4, s6 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_bfe_i32 s0, s8, 0x180000 +; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_mul_i32 s0, s1, s0 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[4:5], 0x2c -; VI-NEXT: s_load_dword s6, s[4:5], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 8 -; VI-NEXT: s_lshl_b32 s5, s6, 8 -; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bfe_i32 s2, s2, 0x180000 +; VI-NEXT: s_bfe_i32 s3, s4, 0x180000 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 ; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s6, s7, 0x180000 ; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 @@ -580,45 +576,45 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-NEXT: s_load_dword s7, s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s6, s[4:5], 0x34 -; VI-NEXT: s_load_dword s7, s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: 
s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index a166c4f93462d1..15eb41a1a5b65b 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -461,18 +461,19 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s0, s2, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-NEXT: s_ashr_i32 s1, s3, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s0, s0, s8 +; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -491,18 +492,19 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-IR-NEXT: s_ashr_i32 s1, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s8 +; GCN-IR-NEXT: 
s_xor_b32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -676,11 +678,11 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: s_ashr_i32 s8, s0, 1 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -688,30 +690,30 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_ashr_i32 s0, s3, 1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_abs_i32 s0, s2 -; GCN-NEXT: s_xor_b32 s1, s2, s8 +; GCN-NEXT: s_abs_i32 s2, s0 +; GCN-NEXT: s_xor_b32 s0, s0, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s3, s2, s9 -; GCN-NEXT: s_sub_i32 s0, s0, s3 -; GCN-NEXT: s_add_i32 s8, s2, 1 -; GCN-NEXT: s_sub_i32 s3, s0, s9 -; GCN-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-NEXT: s_cselect_b32 s2, s8, s2 -; GCN-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-NEXT: s_add_i32 s3, s2, 1 -; GCN-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-NEXT: s_cselect_b32 s0, s3, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s0, s1 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_mul_i32 s3, s1, s9 +; GCN-NEXT: s_sub_i32 s2, s2, s3 +; GCN-NEXT: s_add_i32 s8, s1, 1 +; GCN-NEXT: s_sub_i32 s3, s2, s9 +; GCN-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-NEXT: s_cselect_b32 s1, s8, s1 +; GCN-NEXT: s_cselect_b32 s2, s3, s2 +; GCN-NEXT: s_add_i32 s3, s1, 1 +; GCN-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-NEXT: s_cselect_b32 s1, s3, s1 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_sub_i32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -720,11 +722,11 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i32 s8, s0, 1 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -732,30 +734,30 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_abs_i32 s0, s2 -; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 +; GCN-IR-NEXT: s_abs_i32 s2, s0 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-IR-NEXT: s_mul_i32 s3, s2, s9 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 -; GCN-IR-NEXT: s_add_i32 s8, s2, 1 -; GCN-IR-NEXT: s_sub_i32 s3, s0, s9 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-IR-NEXT: s_cselect_b32 s2, s8, s2 -; GCN-IR-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-IR-NEXT: s_add_i32 s3, s2, 1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-IR-NEXT: s_cselect_b32 s0, s3, s2 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-IR-NEXT: s_mul_i32 s3, s1, s9 +; GCN-IR-NEXT: s_sub_i32 s2, s2, s3 +; GCN-IR-NEXT: s_add_i32 s8, s1, 1 +; GCN-IR-NEXT: s_sub_i32 s3, s2, s9 +; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-IR-NEXT: s_cselect_b32 s1, s8, s1 +; GCN-IR-NEXT: s_cselect_b32 s2, s3, s2 +; GCN-IR-NEXT: s_add_i32 s3, s1, 1 +; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-IR-NEXT: s_cselect_b32 s1, s3, s1 +; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 +; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -772,18 +774,19 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s0, s2, 9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-NEXT: s_ashr_i32 s1, s3, 9 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s0, s0, s8 +; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -802,18 +805,19 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-IR-NEXT: s_ashr_i32 s1, s3, 9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s8 +; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 
s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -838,11 +842,11 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: s_ashr_i32 s8, s0, 7 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -850,30 +854,30 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_ashr_i32 s0, s3, 7 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_abs_i32 s0, s2 -; GCN-NEXT: s_xor_b32 s1, s2, s8 +; GCN-NEXT: s_abs_i32 s2, s0 +; GCN-NEXT: s_xor_b32 s0, s0, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s3, s2, s9 -; GCN-NEXT: s_sub_i32 s0, s0, s3 -; GCN-NEXT: s_add_i32 s8, s2, 1 -; GCN-NEXT: s_sub_i32 s3, s0, s9 -; GCN-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-NEXT: s_cselect_b32 s2, s8, s2 -; GCN-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-NEXT: s_add_i32 s3, s2, 1 -; GCN-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-NEXT: s_cselect_b32 s0, s3, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s0, s1 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_mul_i32 s3, s1, s9 +; GCN-NEXT: s_sub_i32 s2, s2, s3 +; GCN-NEXT: s_add_i32 s8, s1, 1 +; GCN-NEXT: s_sub_i32 s3, s2, s9 +; GCN-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-NEXT: s_cselect_b32 s1, s8, s1 +; GCN-NEXT: s_cselect_b32 s2, s3, s2 +; GCN-NEXT: s_add_i32 s3, s1, 1 +; GCN-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-NEXT: s_cselect_b32 s1, s3, s1 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_sub_i32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -882,11 +886,11 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i32 s8, s0, 7 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -894,30 +898,30 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 7 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 
-; GCN-IR-NEXT: s_abs_i32 s0, s2 -; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 +; GCN-IR-NEXT: s_abs_i32 s2, s0 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-IR-NEXT: s_mul_i32 s3, s2, s9 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 -; GCN-IR-NEXT: s_add_i32 s8, s2, 1 -; GCN-IR-NEXT: s_sub_i32 s3, s0, s9 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-IR-NEXT: s_cselect_b32 s2, s8, s2 -; GCN-IR-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-IR-NEXT: s_add_i32 s3, s2, 1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-IR-NEXT: s_cselect_b32 s0, s3, s2 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-IR-NEXT: s_mul_i32 s3, s1, s9 +; GCN-IR-NEXT: s_sub_i32 s2, s2, s3 +; GCN-IR-NEXT: s_add_i32 s8, s1, 1 +; GCN-IR-NEXT: s_sub_i32 s3, s2, s9 +; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-IR-NEXT: s_cselect_b32 s1, s8, s1 +; GCN-IR-NEXT: s_cselect_b32 s2, s3, s2 +; GCN-IR-NEXT: s_add_i32 s3, s1, 1 +; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 +; GCN-IR-NEXT: s_cselect_b32 s1, s3, s1 +; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 +; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -938,27 +942,27 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[12:13], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], 40 +; GCN-NEXT: s_ashr_i32 s4, s13, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: s_ashr_i32 s5, s9, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GCN-NEXT: s_xor_b32 s4, s5, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s5, s8, s6 -; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_ashr_i32 s6, s11, 8 +; GCN-NEXT: s_ashr_i32 s7, s15, 8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_cselect_b32 s5, s5, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s5, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GCN-NEXT: s_xor_b32 s4, s4, s10 +; GCN-NEXT: s_or_b32 s8, s4, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s8, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-NEXT: s_xor_b32 s4, s6, s7 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-NEXT: s_or_b32 s6, s4, 1 @@ -984,27 +988,27 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[12:13], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; 
GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[10:11], 40 +; GCN-IR-NEXT: s_ashr_i32 s4, s13, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: s_ashr_i32 s5, s9, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GCN-IR-NEXT: s_xor_b32 s4, s5, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s5, s8, s6 -; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: s_ashr_i32 s6, s11, 8 +; GCN-IR-NEXT: s_ashr_i32 s7, s15, 8 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: s_or_b32 s5, s5, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s5, v2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s10 +; GCN-IR-NEXT: s_or_b32 s8, s4, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s8, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s7 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-IR-NEXT: s_or_b32 s6, s4, 1 @@ -1794,21 +1798,21 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: s_ashr_i32 s2, s3, 8 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s3, 0x41c00000 +; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s2 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1821,21 +1825,21 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s2 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GCN-IR-NEXT: 
s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1851,23 +1855,23 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-NEXT: s_ashr_i32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: s_or_b32 s2, s0, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1877,23 +1881,23 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 ; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1909,18 +1913,18 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; 
GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1929,18 +1933,18 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1954,18 +1958,18 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_sdiv24_pow2_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1974,18 +1978,18 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_sdiv24_pow2_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 +; 
GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1999,27 +2003,28 @@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) { ; GCN-LABEL: v_test_sdiv24_pow2_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 17, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_sdiv24_pow2_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v1, -v2, s4, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, s4, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 65a17ed67481cc..5734c81c043fdd 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -112,9 +112,10 @@ define i128 @v_ashr_i128_vk(i128 %lhs) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v1 ; GCN-NEXT: v_lshl_b64 v[0:1], v[2:3], 31 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 1, v3 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 %lhs, 33 ret i128 %shl diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index af78768520d23f..ec03043873c3da 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -482,12 +482,12 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_ashr_i64 s[0:1], s[2:3], 48 ; SI-NEXT: s_ashr_i32 s1, s2, 16 ; SI-NEXT: s_sext_i32_i16 s2, s2 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mov_b32_e32 
v0, s2 +; SI-NEXT: s_ashr_i32 s0, s3, 16 ; SI-NEXT: s_sext_i32_i16 s3, s3 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -555,9 +555,9 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 6ca8f490ff1651..47998767a948cf 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -374,7 +374,7 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s0 ; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_ashr_i64 s[0:1], s[2:3], 48 +; CI-NEXT: s_ashr_i32 s0, s3, 16 ; CI-NEXT: s_ashr_i32 s1, s2, 16 ; CI-NEXT: s_lshr_b32 s8, s2, 16 ; CI-NEXT: s_lshr_b32 s9, s3, 16 @@ -488,9 +488,9 @@ define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_mov_b32 s0, s4 ; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48 -; CI-NEXT: v_bfe_i32 v4, v1, 0, 16 -; CI-NEXT: v_bfe_i32 v3, v0, 0, 16 +; CI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; CI-NEXT: v_ashrrev_i32_e32 v3, 16, v1 +; CI-NEXT: v_bfe_i32 v4, v0, 0, 16 ; CI-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 @@ -502,18 +502,18 @@ define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: v_sub_i32_e32 v6, vcc, 0, v6 ; CI-NEXT: v_bfe_i32 v7, v7, 0, 16 ; CI-NEXT: v_bfe_i32 v6, v6, 0, 16 -; CI-NEXT: v_max_i32_e32 v0, v3, v0 -; CI-NEXT: v_max_i32_e32 v1, v4, v1 -; CI-NEXT: v_max_i32_e32 v3, v5, v6 -; CI-NEXT: v_max_i32_e32 v2, v2, v7 +; CI-NEXT: v_max_i32_e32 v0, v4, v0 +; CI-NEXT: v_max_i32_e32 v1, v2, v1 +; CI-NEXT: v_max_i32_e32 v2, v5, v6 +; CI-NEXT: v_max_i32_e32 v3, v3, v7 ; CI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-NEXT: v_or_b32_e32 v0, v2, v0 ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x20000, v1 ; CI-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 68ed7cecd8ff78..ce89b2a962eeaf 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -788,18 +788,265 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ret void } +define amdgpu_kernel void @s_ashr_33_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { +; SI-LABEL: s_ashr_33_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; 
SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_ashr_i32 s6, s7, 31 +; SI-NEXT: s_ashr_i32 s7, s7, 1 +; SI-NEXT: s_add_u32 s4, s7, s4 +; SI-NEXT: s_addc_u32 s5, s6, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ashr_33_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_ashr_i32 s6, s7, 31 +; VI-NEXT: s_ashr_i32 s7, s7, 1 +; VI-NEXT: s_add_u32 s4, s7, s4 +; VI-NEXT: s_addc_u32 s5, s6, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_ashr_33_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: ASHR T0.W, KC0[5].X, 1, +; EG-NEXT: ASHR * T1.W, KC0[5].X, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.W, PS, KC0[7].Z, +; EG-NEXT: ADDC_UINT * T2.W, PV.W, KC0[7].Y, +; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, +; EG-NEXT: ADD_INT T0.X, T0.W, KC0[7].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = ashr i64 %a, 33 + %add = add i64 %result, %b + store i64 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_ashr_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_ashr_33_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v3, 1, v3 +; SI-NEXT: buffer_store_dwordx2 v[3:4], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ashr_33_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; VI-NEXT: v_ashrrev_i32_e32 v4, 1, v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_ashr_33_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: ASHR * T0.Y, T0.X, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T0.X, 
T0.X, 1, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid + %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid + %a = load i64, ptr addrspace(1) %gep.in + %result = ashr i64 %a, 33 + store i64 %result, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @s_ashr_62_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { +; SI-LABEL: s_ashr_62_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_ashr_i32 s6, s7, 31 +; SI-NEXT: s_ashr_i32 s7, s7, 30 +; SI-NEXT: s_add_u32 s4, s7, s4 +; SI-NEXT: s_addc_u32 s5, s6, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ashr_62_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_ashr_i32 s6, s7, 31 +; VI-NEXT: s_ashr_i32 s7, s7, 30 +; VI-NEXT: s_add_u32 s4, s7, s4 +; VI-NEXT: s_addc_u32 s5, s6, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_ashr_62_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: ASHR T0.W, KC0[5].X, literal.x, +; EG-NEXT: ASHR * T1.W, KC0[5].X, literal.y, +; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) +; EG-NEXT: ADD_INT T1.W, PS, KC0[7].Z, +; EG-NEXT: ADDC_UINT * T2.W, PV.W, KC0[7].Y, +; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, +; EG-NEXT: ADD_INT T0.X, T0.W, KC0[7].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = ashr i64 %a, 62 + %add = add i64 %result, %b + store i64 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_ashr_62_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_ashr_62_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; SI-NEXT: buffer_store_dwordx2 v[3:4], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ashr_62_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; VI-NEXT: v_ashrrev_i32_e32 v4, 30, v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_ashr_62_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: ASHR * T0.Y, T0.X, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T0.X, T0.X, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid + %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid + %a = load i64, ptr addrspace(1) %gep.in + %result = ashr i64 %a, 62 + store i64 %result, ptr addrspace(1) %gep.out + ret void +} + define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[4:5], 0x14 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x1d ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s8, 31 -; SI-NEXT: s_add_u32 s4, s5, s6 -; SI-NEXT: s_addc_u32 s5, s5, s7 +; SI-NEXT: s_ashr_i32 s5, s7, 31 +; SI-NEXT: s_add_u32 s4, s5, s8 +; SI-NEXT: s_addc_u32 s5, s5, s9 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -807,15 +1054,15 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[4:5], 0x50 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x74 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x74 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s8, 31 -; VI-NEXT: s_add_u32 s4, s5, s6 -; VI-NEXT: s_addc_u32 s5, s5, s7 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_add_u32 s4, s5, s8 +; VI-NEXT: s_addc_u32 s5, s5, s9 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -851,14 +1098,13 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; SI-NEXT: v_ashrrev_i32_e32 v2, 
31, v3 ; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ashr_63_i64: @@ -866,19 +1112,17 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ashr_63_i64: diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c9e5ff444f7159..c729c3fb8a4e4c 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -443,31 +443,32 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_ashr_i32 s3, s3, 9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s2, s2, 9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-NEXT: s_xor_b32 s0, s3, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s1, s0, s8 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s8, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; GCN-NEXT: s_mul_i32 s1, s1, s8 -; GCN-NEXT: s_sub_i32 s0, s0, s1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: s_mul_i32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s0, s3, s0 ; GCN-NEXT: s_bfe_i32 s0, s0, 0x170000 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -478,31 +479,32 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: 
s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s2, s2, 9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s0, s8 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s8, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-IR-NEXT: s_add_i32 s1, s2, s1 -; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-IR-NEXT: s_add_i32 s0, s1, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 +; GCN-IR-NEXT: s_sub_i32 s0, s3, s0 ; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x170000 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 @@ -520,31 +522,32 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_ashr_i32 s3, s3, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s2, s2, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-NEXT: s_xor_b32 s0, s3, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s1, s0, s8 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s8, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; GCN-NEXT: s_mul_i32 s1, s1, s8 -; GCN-NEXT: s_sub_i32 s0, s0, s1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: 
s_mul_i32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s0, s3, s0 ; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -555,31 +558,32 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 8 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s2, s2, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s0, s8 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s8, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-IR-NEXT: s_add_i32 s1, s2, s1 -; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-IR-NEXT: s_add_i32 s0, s1, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 +; GCN-IR-NEXT: s_sub_i32 s0, s3, s0 ; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 @@ -597,23 +601,23 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-LABEL: v_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 40 -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_xor_b32_e32 v5, v0, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v3 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GCN-NEXT: v_xor_b32_e32 v5, v1, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 ; GCN-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v1, -v4, v3, v1 +; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 
0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -621,23 +625,23 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[2:3], v[2:3], 40 -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v5, v0, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v3 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v5, v1, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 30, v5 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v3 ; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v1, v4 +; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_mad_f32 v1, -v4, v3, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -650,11 +654,11 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 +; GCN-NEXT: s_ashr_i32 s0, s0, 7 ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -665,16 +669,16 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_abs_i32 s3, s2 -; GCN-NEXT: s_ashr_i32 s0, s2, 31 +; GCN-NEXT: s_ashr_i32 s2, s3, 7 +; GCN-NEXT: s_abs_i32 s2, s2 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-NEXT: s_mul_i32 s1, s1, s8 -; GCN-NEXT: s_sub_i32 s1, s3, s1 +; GCN-NEXT: s_sub_i32 s1, s2, s1 ; GCN-NEXT: s_sub_i32 s2, s1, s8 ; GCN-NEXT: s_cmp_ge_u32 s1, s8 ; GCN-NEXT: s_cselect_b32 s1, s2, s1 @@ -691,11 +695,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 7 ; GCN-IR-NEXT: s_abs_i32 
s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -706,16 +710,16 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_abs_i32 s3, s2 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 7 +; GCN-IR-NEXT: s_abs_i32 s2, s2 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 -; GCN-IR-NEXT: s_sub_i32 s1, s3, s1 +; GCN-IR-NEXT: s_sub_i32 s1, s2, s1 ; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 ; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 ; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 @@ -739,11 +743,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 +; GCN-NEXT: s_ashr_i32 s0, s0, 1 ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -754,16 +758,16 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_abs_i32 s3, s2 -; GCN-NEXT: s_ashr_i32 s0, s2, 31 +; GCN-NEXT: s_ashr_i32 s2, s3, 1 +; GCN-NEXT: s_abs_i32 s2, s2 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-NEXT: s_mul_i32 s1, s1, s8 -; GCN-NEXT: s_sub_i32 s1, s3, s1 +; GCN-NEXT: s_sub_i32 s1, s2, s1 ; GCN-NEXT: s_sub_i32 s2, s1, s8 ; GCN-NEXT: s_cmp_ge_u32 s1, s8 ; GCN-NEXT: s_cselect_b32 s1, s2, s1 @@ -780,11 +784,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 1 ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -795,16 +799,16 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_abs_i32 s3, s2 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 1 +; GCN-IR-NEXT: s_abs_i32 s2, s2 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; 
GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 -; GCN-IR-NEXT: s_sub_i32 s1, s3, s1 +; GCN-IR-NEXT: s_sub_i32 s1, s2, s1 ; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 ; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 ; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 @@ -1915,24 +1919,24 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s8, 0x41c00000 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: s_ashr_i32 s2, s3, 8 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s3, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s2, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_or_b32 s8, s0, 1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s8 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: s_add_i32 s0, s1, s0 ; GCN-NEXT: s_mul_i32 s0, s0, s2 @@ -1947,24 +1951,24 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b32 s8, 0x41c00000 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_or_b32 s8, s0, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s8 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| ; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-IR-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-IR-NEXT: s_add_i32 s0, s1, s0 ; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 @@ -1985,27 +1989,27 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_ashr_i32 s8, s3, 8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 
s0, s2, 30 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: s_or_b32 s3, s0, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GCN-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: s_add_i32 s0, s1, s0 ; GCN-NEXT: s_mulk_i32 s0, 0x5b7f -; GCN-NEXT: s_sub_i32 s0, s2, s0 +; GCN-NEXT: s_sub_i32 s0, s8, s0 ; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2016,27 +2020,27 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_ashr_i32 s8, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: s_or_b32 s3, s0, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 ; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 ; GCN-IR-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-IR-NEXT: s_add_i32 s0, s1, s0 ; GCN-IR-NEXT: s_mulk_i32 s0, 0x5b7f -; GCN-IR-NEXT: s_sub_i32 s0, s2, s0 +; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 ; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 @@ -2053,19 +2057,19 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_srem24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v0 -; GCN-NEXT: v_or_b32_e32 v3, 1, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2075,19 
+2079,19 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v3, 1, v3 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2102,19 +2106,19 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_srem24_pow2_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v0 -; GCN-NEXT: v_or_b32_e32 v3, 1, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2124,19 +2128,19 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_srem24_pow2_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v3, 1, v3 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, 
vcc, v2, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2151,29 +2155,30 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) { ; GCN-LABEL: v_test_srem24_pow2_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 17, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_srem24_pow2_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 -; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-IR-NEXT: v_mad_f32 v1, -v3, s4, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, s4, v2 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, s4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1