diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6310f7270ceaf..a41258dd6d274 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16811,15 +16811,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     // creating a cycle in a DAG. Let's undo that by mutating the freeze.
     assert(N->getOperand(0) == FrozenN0 && "Expected cycle in DAG");
     DAG.UpdateNodeOperands(N, N0);
+    // Revisit the node.
+    AddToWorklist(N);
     return FrozenN0;
   }
 
-  // We currently avoid folding freeze over SRA/SRL, due to the problems seen
-  // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
-  // example https://reviews.llvm.org/D136529#4120959.
-  if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
-    return SDValue();
-
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
@@ -16832,18 +16828,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
-  // TOOD: we should always allow multiple operands, however this increases the
-  // likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
-  // below causing later nodes that share frozen operands to fold again and no
-  // longer being able to confirm other operands are not poison due to recursion
-  // depth limits on isGuaranteedNotToBeUndefOrPoison.
-  bool AllowMultipleMaybePoisonOperands =
-      N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
-      N0.getOpcode() == ISD::BUILD_VECTOR ||
-      N0.getOpcode() == ISD::BUILD_PAIR ||
-      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
-      N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
-
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
   // instead pick undef values to keep those properties, while at the same time
@@ -16864,74 +16848,13 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     }
   }
 
-  SmallSet<SDValue, 8> MaybePoisonOperands;
-  SmallVector<unsigned, 8> MaybePoisonOperandNumbers;
-  for (auto [OpNo, Op] : enumerate(N0->ops())) {
-    if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly=*/false))
-      continue;
-    bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
-    bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
-    if (IsNewMaybePoisonOperand)
-      MaybePoisonOperandNumbers.push_back(OpNo);
-    if (!HadMaybePoisonOperands)
-      continue;
-    if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
-      // Multiple maybe-poison ops when not allowed - bail out.
-      return SDValue();
-    }
-  }
-  // NOTE: the whole op may be not guaranteed to not be undef or poison because
-  // it could create undef or poison due to it's poison-generating flags.
-  // So not finding any maybe-poison operands is fine.
-
-  for (unsigned OpNo : MaybePoisonOperandNumbers) {
-    // N0 can mutate during iteration, so make sure to refetch the maybe poison
-    // operands via the operand numbers. The typical scenario is that we have
-    // something like this
-    //   t262: i32 = freeze t181
-    //   t150: i32 = ctlz_zero_undef t262
-    //   t184: i32 = ctlz_zero_undef t181
-    //   t268: i32 = select_cc t181, Constant:i32<0>, t184, t186, setne:ch
-    // When freezing the t181 operand we get t262 back, and then the
-    // ReplaceAllUsesOfValueWith call will not only replace t181 by t262, but
-    // also recursively replace t184 by t150.
-    SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo);
-    // Don't replace every single UNDEF everywhere with frozen UNDEF, though.
-    if (MaybePoisonOperand.isUndef())
-      continue;
-    // First, freeze each offending operand.
-    SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
-    // Then, change all other uses of unfrozen operand to use frozen operand.
-    DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
-    if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
-        FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
-      // But, that also updated the use in the freeze we just created, thus
-      // creating a cycle in a DAG. Let's undo that by mutating the freeze.
-      DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
-                             MaybePoisonOperand);
-    }
-
-    // This node has been merged with another.
-    if (N->getOpcode() == ISD::DELETED_NODE)
-      return SDValue(N, 0);
-  }
-
-  assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted!");
-
-  // The whole node may have been updated, so the value we were holding
-  // may no longer be valid. Re-fetch the operand we're `freeze`ing.
-  N0 = N->getOperand(0);
+  // Collect and freeze all operands.
+  SmallVector<SDValue> Ops(N0->ops());
+  for (auto &Op : Ops)
+    Op = DAG.getFreeze(Op);
 
   // Finally, recreate the node, it's operands were updated to use
   // frozen operands, so we just need to use it's "original" operands.
-  SmallVector<SDValue> Ops(N0->ops());
-  // TODO: ISD::UNDEF and ISD::POISON should get separate handling, but best
-  // leave for a future patch.
-  for (SDValue &Op : Ops) {
-    if (Op.isUndef())
-      Op = DAG.getFreeze(Op);
-  }
-
   SDLoc DL(N0);
 
   // Special case handling for ShuffleVectorSDNode nodes.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index e71bf15384727..cd5854f48b8fa 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -129,7 +129,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) { ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -211,7 +211,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) { ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 02ead572145f9..749ca6c46ac06 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -188,7 +188,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 ; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3] ; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1] ; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 @@ -225,7 +225,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0 ; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11 -; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo +; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s1 ; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 49ba0e2ac796a..2342f2ee8b915 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -437,17 +437,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, 
off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] @@ -455,12 +458,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -474,17 +477,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -977,10 +977,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2564,17 +2564,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] @@ -2587,6 +2590,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -3100,10 +3104,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git 
a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index 3303cb86c874e..e703caf4724d8 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -14,15 +14,13 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 - ; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]] - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; GCN-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]] - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[COPY3]] + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[COPY3]], implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index cdd34cbde6ddd..905319fd6a1ab 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1435,25 +1435,15 @@ define i128 @fptoui_f32_to_i128(float %x) { } define i128 @fptosi_f16_to_i128(half %x) { -; SDAG-LABEL: fptosi_f16_to_i128: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: fptosi_f16_to_i128: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fptosi_f16_to_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] %cvt 
= fptosi half %x to i128 ret i128 %cvt } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 5b2213592f495..582eae4521b00 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -9843,53 +9843,53 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s42, s5, 30 -; GFX6-NEXT: s_lshr_b32 s36, s4, 30 -; GFX6-NEXT: s_lshr_b32 s38, s4, 31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 28 -; GFX6-NEXT: s_lshr_b32 s34, s4, 29 -; GFX6-NEXT: s_lshr_b32 s26, s4, 26 -; GFX6-NEXT: s_lshr_b32 s28, s4, 27 -; GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s24, s4, 25 -; GFX6-NEXT: s_lshr_b32 s18, s4, 22 -; GFX6-NEXT: s_lshr_b32 s20, s4, 23 -; GFX6-NEXT: s_lshr_b32 s14, s4, 20 -; GFX6-NEXT: s_lshr_b32 s16, s4, 21 -; GFX6-NEXT: s_lshr_b32 s10, s4, 18 -; GFX6-NEXT: s_lshr_b32 s12, s4, 19 -; GFX6-NEXT: s_lshr_b32 s6, s4, 16 -; GFX6-NEXT: s_lshr_b32 s8, s4, 17 +; GFX6-NEXT: s_lshr_b32 s36, s5, 28 +; GFX6-NEXT: s_lshr_b32 s38, s5, 29 +; GFX6-NEXT: s_lshr_b32 s30, s5, 26 +; GFX6-NEXT: s_lshr_b32 s34, s5, 27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 24 +; GFX6-NEXT: s_lshr_b32 s28, s5, 25 +; GFX6-NEXT: s_lshr_b32 s22, s5, 22 +; GFX6-NEXT: s_lshr_b32 s24, s5, 23 +; GFX6-NEXT: s_lshr_b32 s18, s5, 20 +; GFX6-NEXT: s_lshr_b32 s20, s5, 21 +; GFX6-NEXT: s_lshr_b32 s14, s5, 18 +; GFX6-NEXT: s_lshr_b32 s16, s5, 19 +; GFX6-NEXT: s_lshr_b32 s10, s5, 16 +; GFX6-NEXT: s_lshr_b32 s12, s5, 17 +; GFX6-NEXT: s_lshr_b32 s6, s5, 14 +; GFX6-NEXT: s_lshr_b32 s8, s5, 15 +; GFX6-NEXT: s_mov_b32 s40, s5 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: s_lshr_b32 s40, s4, 14 +; GFX6-NEXT: s_lshr_b32 s40, s5, 12 ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_mov_b32 s44, s5 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v6, s44 ; GFX6-NEXT: v_mov_b32_e32 v7, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 15 +; GFX6-NEXT: s_lshr_b32 s44, s5, 13 ; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 +; GFX6-NEXT: s_lshr_b32 s42, s5, 10 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v8, s36 ; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 13 +; GFX6-NEXT: s_lshr_b32 s36, s5, 11 ; GFX6-NEXT: v_mov_b32_e32 v10, s38 ; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 +; GFX6-NEXT: s_lshr_b32 s38, s5, 8 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v12, s30 ; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 11 +; GFX6-NEXT: s_lshr_b32 s30, s5, 9 ; GFX6-NEXT: v_mov_b32_e32 v14, s34 ; GFX6-NEXT: v_mov_b32_e32 v15, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 8 +; GFX6-NEXT: s_lshr_b32 s34, s5, 6 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 @@ -9897,191 +9897,190 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 9 +; GFX6-NEXT: s_lshr_b32 s26, s5, 7 ; GFX6-NEXT: v_mov_b32_e32 v4, s28 ; GFX6-NEXT: v_mov_b32_e32 v5, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 6 +; GFX6-NEXT: s_lshr_b32 s28, s5, 4 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s22 ; GFX6-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 7 +; GFX6-NEXT: s_lshr_b32 s22, s5, 5 ; GFX6-NEXT: v_mov_b32_e32 v10, s24 ; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 4 +; GFX6-NEXT: s_lshr_b32 s24, s5, 2 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s18 ; GFX6-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 5 +; GFX6-NEXT: s_lshr_b32 s18, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v14, s20 ; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 2 +; GFX6-NEXT: s_lshr_b32 s20, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 3 +; GFX6-NEXT: s_lshr_b32 s14, s4, 30 ; GFX6-NEXT: v_mov_b32_e32 v4, s16 ; GFX6-NEXT: v_mov_b32_e32 v5, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 1 +; GFX6-NEXT: s_lshr_b32 s16, s4, 31 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s10 ; GFX6-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 29 +; GFX6-NEXT: s_lshr_b32 s10, s4, 28 ; GFX6-NEXT: v_mov_b32_e32 v10, s12 ; GFX6-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 28 +; GFX6-NEXT: s_lshr_b32 s12, s4, 29 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s6 ; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s6, s5, 26 +; GFX6-NEXT: s_lshr_b32 s46, s4, 26 ; GFX6-NEXT: v_mov_b32_e32 v14, s8 ; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 27 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_lshr_b32 s8, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 -; GFX6-NEXT: v_mov_b32_e32 v4, s44 
-; GFX6-NEXT: v_mov_b32_e32 v5, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 24 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 +; GFX6-NEXT: s_lshr_b32 s40, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_lshr_b32 s44, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s42 -; GFX6-NEXT: v_mov_b32_e32 v9, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 22 -; GFX6-NEXT: v_mov_b32_e32 v10, s36 -; GFX6-NEXT: v_mov_b32_e32 v11, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 23 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 22 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v11, s7 +; GFX6-NEXT: s_lshr_b32 s42, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s38 -; GFX6-NEXT: v_mov_b32_e32 v13, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 20 -; GFX6-NEXT: v_mov_b32_e32 v14, s30 -; GFX6-NEXT: v_mov_b32_e32 v15, s31 -; GFX6-NEXT: s_lshr_b32 s4, s5, 21 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NEXT: s_lshr_b32 s6, s4, 21 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: s_lshr_b32 s30, s5, 18 -; GFX6-NEXT: v_mov_b32_e32 v4, s26 -; GFX6-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 19 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v16, s34 +; GFX6-NEXT: v_mov_b32_e32 v17, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NEXT: v_mov_b32_e32 v19, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 19 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s28 ; GFX6-NEXT: v_mov_b32_e32 v9, s29 -; GFX6-NEXT: s_lshr_b32 s28, s5, 17 +; GFX6-NEXT: s_lshr_b32 s28, s4, 16 ; GFX6-NEXT: v_mov_b32_e32 v10, s22 ; GFX6-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NEXT: s_lshr_b32 s22, s5, 16 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_lshr_b32 s22, s4, 17 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s24 ; GFX6-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NEXT: 
s_lshr_b32 s24, s5, 14 +; GFX6-NEXT: s_lshr_b32 s24, s4, 14 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v14, s18 ; GFX6-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 15 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s5, 12 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_lshr_b32 s18, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 12 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v18, s14 -; GFX6-NEXT: v_mov_b32_e32 v19, s15 -; GFX6-NEXT: s_lshr_b32 s14, s5, 13 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: s_lshr_b32 s16, s5, 10 +; GFX6-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v18, s16 +; GFX6-NEXT: v_mov_b32_e32 v19, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 10 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 11 -; GFX6-NEXT: v_mov_b32_e32 v10, s10 -; GFX6-NEXT: v_mov_b32_e32 v11, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 8 +; GFX6-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s6, s5, 9 +; GFX6-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NEXT: v_mov_b32_e32 v13, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 9 ; GFX6-NEXT: v_mov_b32_e32 v14, s8 ; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 6 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000 +; GFX6-NEXT: s_lshr_b32 s8, s4, 6 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s34 -; GFX6-NEXT: v_mov_b32_e32 v17, s35 -; GFX6-NEXT: s_lshr_b32 s34, s5, 7 -; GFX6-NEXT: v_mov_b32_e32 v18, s40 -; GFX6-NEXT: v_mov_b32_e32 v19, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 
4 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 2 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 +; GFX6-NEXT: v_mov_b32_e32 v16, s36 +; GFX6-NEXT: v_mov_b32_e32 v17, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v18, s42 +; GFX6-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 3 -; GFX6-NEXT: s_lshr_b32 s44, s5, 1 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s30 +; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 @@ -10090,71 +10089,71 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 -; GFX6-NEXT: v_mov_b32_e32 v10, s4 -; GFX6-NEXT: v_mov_b32_e32 v11, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416 -; GFX6-NEXT: s_waitcnt expcnt(1) -; GFX6-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v11, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(2) +; 
GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 ; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v1, s21 ; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 -; GFX6-NEXT: v_mov_b32_e32 v8, s44 -; GFX6-NEXT: v_mov_b32_e32 v9, s45 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: 
v_mov_b32_e32 v3, s31 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NEXT: v_mov_b32_e32 v9, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a135b43bad0fe..f7396eb1e1159 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6545,33 +6545,33 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 @@ -6592,8 +6592,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s2, s7 -; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s10, s5 +; GCN-HSA-NEXT: s_mov_b32 s8, s5 +; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000 @@ -6611,25 +6611,25 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -7167,12 +7167,12 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s3 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000 @@ -7180,60 +7180,60 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: 
s_bfe_i64 s[16:17], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 @@ 
-7249,19 +7249,19 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s10, s7 -; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s12, s7 ; GCN-HSA-NEXT: s_mov_b32 s14, s5 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31 +; GCN-HSA-NEXT: s_mov_b32 s16, s3 +; GCN-HSA-NEXT: s_mov_b32 s18, s1 +; GCN-HSA-NEXT: s_ashr_i32 s27, s1, 31 ; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16 -; GCN-HSA-NEXT: s_mov_b32 s18, s3 -; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s22, s1 -; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s0, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s28, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31 ; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 16 @@ -7272,36 +7272,55 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s22, s8, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: s_add_u32 s24, s8, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s8, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: s_add_u32 s14, s8, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: 
s_add_u32 s14, s8, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -7310,35 +7329,17 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -8312,151 +8313,148 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s1, 16 +; GCN-NOHSA-SI-NEXT: 
s_ashr_i32 s41, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s9, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s13, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s15, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x100000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s47 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8468,47 +8466,47 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s24, s15 -; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31 -; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31 -; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 -; GCN-HSA-NEXT: s_mov_b32 s52, s11 -; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-HSA-NEXT: s_mov_b32 s30, s9 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s58, s5 -; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 -; GCN-HSA-NEXT: s_mov_b32 s62, s3 -; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s66, s1 +; GCN-HSA-NEXT: s_mov_b32 s34, s15 +; GCN-HSA-NEXT: s_ashr_i32 s41, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s42, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s61, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s65, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s67, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s69, s11, 16 +; GCN-HSA-NEXT: s_mov_b32 s44, s13 +; GCN-HSA-NEXT: s_mov_b32 s46, s11 +; GCN-HSA-NEXT: s_mov_b32 s48, s9 +; GCN-HSA-NEXT: s_mov_b32 s50, s7 +; GCN-HSA-NEXT: s_mov_b32 s52, s5 +; GCN-HSA-NEXT: s_mov_b32 s38, s3 +; GCN-HSA-NEXT: s_mov_b32 s36, s1 +; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16 +; 
GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[34:35], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31 -; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s40, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s43, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s70, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s71, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s72, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s73, s15, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[14:15], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 @@ -8518,149 +8516,149 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 
0xb0 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s72 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s70 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s69 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s67 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: s_add_u32 s36, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s37 +; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 +; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s31 ; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 -; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s44 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v10, s30 +; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s36 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s16, 64 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 32 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b534c2c267fad..5c4bc95578bb4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6398,41 +6398,41 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s16, s5 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s5, 31 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: @@ -6445,11 +6445,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s14, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s8, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s2, 8 ; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 @@ -6465,32 +6465,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 
v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -6502,11 +6502,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 ; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 @@ -6522,32 +6522,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6615,34 +6615,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 -; GFX12-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-NEXT: s_lshr_b32 s8, s2, 24 -; 
GFX12-NEXT: s_lshr_b32 s10, s2, 8 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_lshr_b32 s12, s3, 8 -; GFX12-NEXT: s_mov_b32 s14, s3 +; GFX12-NEXT: s_lshr_b32 s6, s3, 8 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NEXT: s_lshr_b32 s12, s2, 24 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i32 s15, s3, 31 ; GFX12-NEXT: s_ashr_i32 s18, s3, 24 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: s_lshr_b32 s14, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7 -; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v7, s11 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v13, s3 -; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_mov_b32_e32 v14, s12 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] ; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = sext <8 x i8> %load to <8 x i64> @@ -7033,81 +7033,80 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 +; 
GFX6-NOHSA-NEXT: s_mov_b32 s26, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s7, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -7119,33 +7118,31 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24 -; GFX7-HSA-NEXT: s_mov_b32 s22, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s28, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s31, s5, 24 +; GFX7-HSA-NEXT: s_mov_b32 s24, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 
s30, s4, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 @@ -7153,70 +7150,73 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s24, s0, 0x50 -; GFX7-HSA-NEXT: s_addc_u32 s25, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -7225,109 +7225,107 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s10, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s8, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s11, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s11, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s28, s11 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s9, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9 -; GFX8-NOHSA-NEXT: s_ashr_i32 s25, s9, 31 -; 
GFX8-NOHSA-NEXT: s_ashr_i32 s29, s9, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 8 +; GFX8-NOHSA-NEXT: s_ashr_i32 s19, s5, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s11, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0x50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; 
GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -7437,64 +7435,64 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s2, s6, 16 -; GFX12-NEXT: s_lshr_b32 s8, s6, 24 -; GFX12-NEXT: s_lshr_b32 s10, s6, 8 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX12-NEXT: s_lshr_b32 s12, s4, 16 -; GFX12-NEXT: s_lshr_b32 s14, s4, 24 +; GFX12-NEXT: s_lshr_b32 s8, s7, 16 +; GFX12-NEXT: s_lshr_b32 s10, s7, 8 +; GFX12-NEXT: s_mov_b32 s12, s7 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 -; GFX12-NEXT: s_lshr_b32 s16, s4, 8 -; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s3 -; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v3, s11 -; GFX12-NEXT: s_lshr_b32 s18, s7, 16 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31 -; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s13 -; GFX12-NEXT: s_lshr_b32 s20, s7, 8 -; GFX12-NEXT: s_mov_b32 s22, s7 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: s_lshr_b32 s24, s5, 16 ; GFX12-NEXT: s_ashr_i32 s33, s7, 31 ; GFX12-NEXT: s_ashr_i32 s36, s7, 24 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: s_lshr_b32 s14, s6, 16 +; GFX12-NEXT: s_lshr_b32 s16, s6, 24 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: s_lshr_b32 s18, s6, 8 +; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s35 +; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: s_lshr_b32 s20, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s11 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s15 +; GFX12-NEXT: s_lshr_b32 s22, s5, 8 +; GFX12-NEXT: s_mov_b32 s24, s5 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 -; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 -; GFX12-NEXT: s_lshr_b32 s26, s5, 8 -; GFX12-NEXT: s_mov_b32 s28, s5 -; GFX12-NEXT: s_ashr_i32 s27, s5, 31 -; GFX12-NEXT: s_ashr_i32 s29, s5, 24 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: s_lshr_b32 s26, s4, 16 +; GFX12-NEXT: s_lshr_b32 s28, s4, 24 +; GFX12-NEXT: s_ashr_i32 s29, s5, 31 +; GFX12-NEXT: s_ashr_i32 s31, 
s5, 24 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s17 +; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: s_lshr_b32 s30, s4, 8 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v6, s18 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v3, s33 -; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v2, s36 -; GFX12-NEXT: v_mov_b32_e32 v9, s23 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s21 -; GFX12-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v17, s25 -; GFX12-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v19, s27 -; GFX12-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v21, s5 -; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s7 -; GFX12-NEXT: v_mov_b32_e32 v22, s6 -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:32 +; GFX12-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s29 +; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v2, s31 +; GFX12-NEXT: v_mov_b32_e32 v9, s25 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v11, s23 +; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v17, s27 +; GFX12-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v19, s7 +; GFX12-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s3 +; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s5 +; GFX12-NEXT: v_mov_b32_e32 v22, s4 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i64> @@ -8206,157 +8204,157 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s30, s7 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; 
GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s34, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 ; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s44, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s52, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s56 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s6 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], 
s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -8368,212 +8366,211 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s42, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s1, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s45, s3, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s46, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s54, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s36, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s26, s3 +; 
GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s1, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s0, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s68, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s24, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s16, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s62, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s66, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s67, s5, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s68, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s69, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[64:65], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; 
GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s58 +; GFX7-HSA-NEXT: s_add_u32 s58, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s59 +; GFX7-HSA-NEXT: s_addc_u32 s59, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s49 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s49 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s63 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s69 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s58 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 -; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s30 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s31 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s31 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s57 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s38 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s24 +; GFX7-HSA-NEXT: 
s_add_u32 s24, s8, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s30 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s28 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s25 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: s_add_u32 s16, s8, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: s_addc_u32 s17, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: s_add_u32 s14, s8, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s8, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s8, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -8583,175 +8580,140 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s2, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s0, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s0, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s18, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s1, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s10, s1 -; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[0:1], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s54, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s46, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s30, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX8-NOHSA-NEXT: 
s_lshr_b32 s24, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s1, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s1, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s18, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s0, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s0, 8 +; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s42, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[44:45], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s3, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s44, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s5, 31 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s1, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s7, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s3, 31 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[64:65], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s5, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s66, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 24 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s69 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50 +; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s66 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xe0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xd0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s38 -; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x80 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xc0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xb0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xa0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v0, s30 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NOHSA-NEXT: s_add_u32 s40, s8, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 +; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 +; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NOHSA-NEXT: s_add_u32 s36, s8, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xb0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s37, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -8761,15 +8723,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -8780,16 +8760,32 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -8988,120 +8984,122 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s34, s6, 16 -; GFX12-NEXT: s_lshr_b32 s36, s6, 24 -; GFX12-NEXT: s_lshr_b32 s38, s6, 8 -; GFX12-NEXT: s_lshr_b32 s40, s4, 16 -; GFX12-NEXT: s_lshr_b32 s42, s4, 24 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: s_lshr_b32 s44, 
s4, 8 +; GFX12-NEXT: s_lshr_b32 s40, s7, 16 +; GFX12-NEXT: s_lshr_b32 s50, s6, 8 +; GFX12-NEXT: s_lshr_b32 s62, s3, 16 +; GFX12-NEXT: s_ashr_i32 s51, s3, 24 +; GFX12-NEXT: s_lshr_b32 s42, s7, 8 +; GFX12-NEXT: s_mov_b32 s44, s7 +; GFX12-NEXT: s_lshr_b32 s46, s6, 16 +; GFX12-NEXT: s_lshr_b32 s48, s6, 24 +; GFX12-NEXT: s_lshr_b32 s38, s5, 16 +; GFX12-NEXT: s_lshr_b32 s52, s5, 8 +; GFX12-NEXT: s_mov_b32 s54, s5 +; GFX12-NEXT: s_lshr_b32 s56, s4, 16 +; GFX12-NEXT: s_lshr_b32 s58, s4, 24 +; GFX12-NEXT: s_lshr_b32 s60, s4, 8 +; GFX12-NEXT: s_lshr_b32 s36, s3, 8 +; GFX12-NEXT: s_mov_b32 s34, s3 +; GFX12-NEXT: s_lshr_b32 s28, s2, 16 +; GFX12-NEXT: s_lshr_b32 s26, s2, 24 +; GFX12-NEXT: s_lshr_b32 s24, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_ashr_i32 s39, s3, 31 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 +; GFX12-NEXT: s_ashr_i32 s62, s5, 31 +; GFX12-NEXT: s_ashr_i32 s63, s5, 24 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 +; GFX12-NEXT: s_ashr_i32 s50, s7, 31 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37 -; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s67 -; GFX12-NEXT: s_lshr_b32 s28, s2, 16 -; GFX12-NEXT: s_lshr_b32 s46, s2, 24 -; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000 +; GFX12-NEXT: s_ashr_i32 s7, s7, 24 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s39 -; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s41 -; GFX12-NEXT: s_lshr_b32 s48, s2, 8 -; GFX12-NEXT: v_dual_mov_b32 v8, s40 :: v_dual_mov_b32 v11, s43 -; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s65 -; GFX12-NEXT: s_lshr_b32 s50, s0, 16 -; GFX12-NEXT: s_lshr_b32 s52, s0, 24 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s45 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v14, s44 -; GFX12-NEXT: s_lshr_b32 s54, s0, 8 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s41 ; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX12-NEXT: s_lshr_b32 s56, s7, 16 -; GFX12-NEXT: s_lshr_b32 s58, s5, 16 -; GFX12-NEXT: s_lshr_b32 s60, s1, 8 -; GFX12-NEXT: s_mov_b32 s62, s1 -; GFX12-NEXT: s_ashr_i32 s57, s1, 24 -; GFX12-NEXT: s_ashr_i32 s59, s3, 31 -; GFX12-NEXT: s_ashr_i32 s61, s3, 24 -; GFX12-NEXT: s_ashr_i32 s63, s5, 31 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v3, s50 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s45 +; GFX12-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v7, s43 +; GFX12-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v9, s47 +; GFX12-NEXT: v_dual_mov_b32 v8, s46 :: v_dual_mov_b32 v11, s49 +; GFX12-NEXT: v_dual_mov_b32 v10, s48 :: v_dual_mov_b32 v13, s67 +; GFX12-NEXT: v_dual_mov_b32 v12, s66 :: v_dual_mov_b32 v15, s5 +; GFX12-NEXT: v_mov_b32_e32 v14, s4 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[38:39], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[50:51], 
s[50:51], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:208 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:192 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s62 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s63 +; GFX12-NEXT: v_mov_b32_e32 v5, s55 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s54 :: v_dual_mov_b32 v7, s53 +; GFX12-NEXT: v_dual_mov_b32 v6, s52 :: v_dual_mov_b32 v9, s57 +; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59 +; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31 +; GFX12-NEXT: s_lshr_b32 s22, s1, 16 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61 +; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s3 +; GFX12-NEXT: s_lshr_b32 s16, s1, 8 +; GFX12-NEXT: s_mov_b32 s18, s1 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s39 +; GFX12-NEXT: v_dual_mov_b32 v18, s51 :: v_dual_mov_b32 v21, s35 +; GFX12-NEXT: s_lshr_b32 s14, s0, 16 +; GFX12-NEXT: s_lshr_b32 s12, s0, 24 +; GFX12-NEXT: s_ashr_i32 s6, s1, 31 +; GFX12-NEXT: s_ashr_i32 s33, s1, 24 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37 +; GFX12-NEXT: v_mov_b32_e32 v22, s36 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 -; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s47 -; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s46 -; GFX12-NEXT: v_mov_b32_e32 v5, s31 -; GFX12-NEXT: s_lshr_b32 s26, s7, 8 -; GFX12-NEXT: s_mov_b32 s24, s7 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s49 -; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51 -; GFX12-NEXT: s_lshr_b32 s18, s5, 8 -; GFX12-NEXT: s_mov_b32 s20, s5 -; GFX12-NEXT: s_lshr_b32 s16, s3, 16 -; GFX12-NEXT: s_lshr_b32 s12, s3, 8 -; GFX12-NEXT: s_mov_b32 s14, s3 -; GFX12-NEXT: s_lshr_b32 s10, s1, 16 -; GFX12-NEXT: s_ashr_i32 s33, s1, 31 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x80000 -; GFX12-NEXT: s_ashr_i32 s60, s5, 24 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x80000 -; GFX12-NEXT: s_ashr_i32 s58, s7, 31 -; GFX12-NEXT: s_ashr_i32 s62, s7, 24 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s50 :: v_dual_mov_b32 v11, s53 -; GFX12-NEXT: v_dual_mov_b32 v10, s52 :: v_dual_mov_b32 v13, s23 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; 
GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s55 -; GFX12-NEXT: v_dual_mov_b32 v14, s54 :: v_dual_mov_b32 v17, s7 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s27 +; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s26 +; GFX12-NEXT: v_mov_b32_e32 v5, s21 +; GFX12-NEXT: s_lshr_b32 s64, s0, 8 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s58 -; GFX12-NEXT: v_dual_mov_b32 v18, s62 :: v_dual_mov_b32 v21, s25 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v20, s24 :: v_dual_mov_b32 v23, s27 -; GFX12-NEXT: v_mov_b32_e32 v22, s26 -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:80 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:240 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:224 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s63 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s60 -; GFX12-NEXT: v_mov_b32_e32 v5, s21 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s25 +; GFX12-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v9, s23 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19 -; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s59 -; GFX12-NEXT: v_dual_mov_b32 v10, s61 :: v_dual_mov_b32 v13, s15 -; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11 -; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s33 -; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v21, s3 -; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s1 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s6 +; GFX12-NEXT: v_dual_mov_b32 v10, s33 :: v_dual_mov_b32 v13, s19 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s17 +; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v17, s15 +; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s13 +; GFX12-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v21, s11 +; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1 ; GFX12-NEXT: v_mov_b32_e32 v22, s0 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:112 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:96 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64 +; GFX12-NEXT: global_store_b128 v24, 
v[8:11], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] ; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = sext <32 x i8> %load to <32 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 09d3c3b01b809..1e74d7caa6c75 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -6362,29 +6362,46 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: @@ -6397,44 +6414,59 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-HSA-NEXT: s_mov_b32 s6, s5 +; GCN-HSA-NEXT: s_mov_b32 s8, s3 +; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s11, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s13, s3, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s18, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s19, s5, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GCN-HSA-NEXT: 
s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: @@ -6980,149 +7012,212 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v5 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s9 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s9, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[18:19], 
0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s11, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[16:17], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-HSA-NEXT: s_mov_b32 s6, s5 +; GCN-HSA-NEXT: s_mov_b32 s8, s3 +; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s18, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s19, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s20, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s21, s5, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v5 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v7 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 +; GCN-HSA-NEXT: s_mov_b32 s14, s5 +; GCN-HSA-NEXT: s_mov_b32 s16, s3 +; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s15, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s17, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s19, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s21, s5, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 
0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[18:19], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v19, 16, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[22:25] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -8112,299 +8207,420 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s16, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s17, v5 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s14, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s15, v7 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v12 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v13 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s20, v14 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s21, v15 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v8 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v9 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v11 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v10 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s15 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s17 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s21 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s19 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s25 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s23 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s20, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s24, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s22, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s23, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s23, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s25, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s19, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s19, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s21, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s21, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s17, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s17, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[16:17], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s13, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s11, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s11, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s15, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s15, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], 
s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[58:59], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s66 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; 
GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v5 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v4 +; GCN-HSA-NEXT: s_mov_b32 s10, s9 +; GCN-HSA-NEXT: s_mov_b32 s12, s7 +; 
GCN-HSA-NEXT: s_lshr_b32 s14, s8, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: v_readfirstlane_b32 s23, v17 +; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 16 +; GCN-HSA-NEXT: v_readfirstlane_b32 s18, v14 +; GCN-HSA-NEXT: v_readfirstlane_b32 s19, v15 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s24, s23 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s33, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s49, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s50, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s51, s9, 16 +; GCN-HSA-NEXT: v_readfirstlane_b32 s22, v16 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s26, s19 +; GCN-HSA-NEXT: s_lshr_b32 s16, s18, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s52, s19, 31 +; GCN-HSA-NEXT: s_ashr_i32 s53, s19, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s13 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_readfirstlane_b32 s12, v10 +; GCN-HSA-NEXT: v_readfirstlane_b32 s13, v11 +; GCN-HSA-NEXT: v_readfirstlane_b32 s14, v12 +; GCN-HSA-NEXT: v_readfirstlane_b32 s15, v13 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_lshr_b32 s28, s22, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[26:27], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s19 +; GCN-HSA-NEXT: s_lshr_b32 s22, s12, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s58, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s59, s15, 16 +; GCN-HSA-NEXT: s_mov_b32 s12, s15 +; GCN-HSA-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s36, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s37, v3 +; GCN-HSA-NEXT: s_mov_b32 s24, s13 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s44, s15 +; GCN-HSA-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-HSA-NEXT: s_mov_b32 s46, s37 +; GCN-HSA-NEXT: s_lshr_b32 s48, s36, 16 +; GCN-HSA-NEXT: s_ashr_i32 s54, s23, 31 +; GCN-HSA-NEXT: s_ashr_i32 s55, s23, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s56, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s57, s13, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s60, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s61, s15, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s62, s37, 31 +; GCN-HSA-NEXT: s_ashr_i32 s63, s37, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[36:37], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[48:49], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s46, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s46 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v23, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GCN-HSA-NEXT: s_add_u32 s38, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s39, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s39, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s0, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s39, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s52 +; GCN-HSA-NEXT: s_addc_u32 s39, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s58 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s56 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s62 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, 
s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 
31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7 -; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, 
v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s38 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index f879dc660203f..9631398b6de67 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -6270,43 +6270,44 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i64: @@ -6322,53 +6323,55 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8 -; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8 -; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 8 +; GCN-HSA-NEXT: s_mov_b32 s8, s3 +; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s14, s2, 8 +; GCN-HSA-NEXT: s_ashr_i32 s15, s3, 31 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s18, s3, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 
s5 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i64: @@ -6382,44 +6385,46 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s5, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s20, s5, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-NOHSA-VI-NEXT: 
s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v8i8_to_v8i64: @@ -6934,85 +6939,84 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s9, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s9 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s8, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s8, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[8:9], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s9, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64: @@ -7025,42 +7029,42 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 -; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-HSA-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 8 +; GCN-HSA-NEXT: s_mov_b32 s10, s5 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 +; GCN-HSA-NEXT: s_ashr_i32 s7, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s9, s5, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 
0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8 ; GCN-HSA-NEXT: s_mov_b32 s22, s3 -; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s5, s3, 31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8 -; GCN-HSA-NEXT: s_mov_b32 s24, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 @@ -7070,66 +7074,66 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], 
v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64: @@ -7146,81 +7150,80 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 
s18, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s9, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s8, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s8, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s9, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s4, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[8:9], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s5, 31 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 
s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s29 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s13 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i8_to_v16i64: @@ -8176,166 +8179,166 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: 
v_readfirstlane_b32 s18, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s36, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s37, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s38, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s39, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v5 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s39, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s39, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s39 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s38, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s38, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s38, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s37, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s37, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s37 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s36, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s36, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s36, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s9, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s39, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s39, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[38:39], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s37, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s37, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39 -; 
GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s41 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s8, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s7, 8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s7 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s29 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 24 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s6, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s9, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, 
s9, 24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s7, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[22:23], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[44:45], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: 
s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64: @@ -8348,223 +8351,225 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v8 +; GCN-HSA-NEXT: v_readfirstlane_b32 s11, v9 ; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5 ; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8 -; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16 -; GCN-HSA-NEXT: s_mov_b32 s28, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s16, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s11, 8 +; GCN-HSA-NEXT: s_mov_b32 s24, s11 +; GCN-HSA-NEXT: s_lshr_b32 s22, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s10, 24 +; GCN-HSA-NEXT: s_lshr_b32 s2, s10, 8 +; GCN-HSA-NEXT: s_lshr_b32 s4, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s7, 8 +; GCN-HSA-NEXT: s_mov_b32 s12, s7 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s30, s6, 8 +; GCN-HSA-NEXT: s_ashr_i32 s43, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s52, s7, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[16:17], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 -; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s24, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s25, v3 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s36, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s37, v5 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_mov_b32 s22, s7 -; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24 -; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8 -; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8 -; GCN-HSA-NEXT: s_mov_b32 s4, s45 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24 -; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8 -; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8 -; GCN-HSA-NEXT: s_mov_b32 s14, s41 -; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24 -; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31 -; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24 -; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s14, s25, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s25, 8 +; GCN-HSA-NEXT: s_mov_b32 s12, s25 +; GCN-HSA-NEXT: s_lshr_b32 s8, s24, 16 +; GCN-HSA-NEXT: s_lshr_b32 s6, s24, 24 +; GCN-HSA-NEXT: s_lshr_b32 s4, s24, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s38, s37, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s37, 8 +; GCN-HSA-NEXT: s_mov_b32 s48, s37 +; GCN-HSA-NEXT: s_lshr_b32 s24, s36, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s36, 24 +; GCN-HSA-NEXT: s_lshr_b32 s18, s36, 8 +; GCN-HSA-NEXT: s_ashr_i32 s50, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s51, s11, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s15 +; GCN-HSA-NEXT: s_ashr_i32 s33, s25, 31 +; GCN-HSA-NEXT: s_ashr_i32 s42, s25, 24 +; GCN-HSA-NEXT: s_ashr_i32 s53, s37, 31 +; GCN-HSA-NEXT: s_ashr_i32 s54, s37, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[36:37], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: 
s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 -; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 -; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 -; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 -; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s44 +; GCN-HSA-NEXT: s_add_u32 s44, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s45 +; GCN-HSA-NEXT: s_addc_u32 s45, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34 +; GCN-HSA-NEXT: s_add_u32 s34, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s44 +; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: s_add_u32 s20, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s21 +; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xf0 ; GCN-HSA-NEXT: 
flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 -; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s21 +; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[20:23] +; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s20 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s29 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -8584,155 +8589,155 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s9, 16 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s9, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s9, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s11, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v5 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s11, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s11 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s11, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s11, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s39 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 
s[60:61], s[38:39], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s10, 8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s15, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s15, 8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s36, s15 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s15, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s15, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s14, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s14, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s13, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s13, 8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s13 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s13, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s13, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s12, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s8, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s12, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[64:65], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[68:69], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s45 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i8_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index ddd1ce66c013a..1eb9fb99cdcc6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -6153,11 +6153,11 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 @@ -6819,10 +6819,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 @@ -6845,24 +6845,24 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v7, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 ; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 ; SI-NEXT: s_endpgm @@ -8114,16 +8114,16 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 ; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 ; SI-NEXT: 
v_ashrrev_i32_e32 v10, 31, v9 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 ; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 diff --git a/llvm/test/CodeGen/AMDGPU/pr155452.ll b/llvm/test/CodeGen/AMDGPU/pr155452.ll index d021b210c7f3a..d93203e8dee23 100644 --- a/llvm/test/CodeGen/AMDGPU/pr155452.ll +++ b/llvm/test/CodeGen/AMDGPU/pr155452.ll @@ -9,59 +9,59 @@ define amdgpu_kernel void @my_kernel(i64 %foo, i32 %bar) { ; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-NEXT: s_add_i32 s12, s12, s17 ; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2 -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], 1 +; CHECK-NEXT: s_load_dword s5, s[8:9], 0x2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_mov_b64 s[2:3], 1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s6, s0, 31 -; CHECK-NEXT: s_abs_i32 s7, s0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s7 -; CHECK-NEXT: s_sub_i32 s0, 0, s7 +; CHECK-NEXT: s_ashr_i32 s4, s5, 31 +; CHECK-NEXT: s_abs_i32 s5, s5 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s5 +; CHECK-NEXT: s_sub_i32 s6, 0, s5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, s0, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, s6, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: s_and_b64 s[0:1], exec, -1 +; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v3, 1 -; CHECK-NEXT: s_mul_i32 s4, s3, s4 -; CHECK-NEXT: s_mul_i32 s5, s2, s5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, s4, v4 -; CHECK-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, s5, v4 -; CHECK-NEXT: s_ashr_i32 s5, s4, 31 -; CHECK-NEXT: s_abs_i32 s8, s4 -; CHECK-NEXT: s_xor_b32 s5, s5, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[6:7], s0, v3, 1 +; CHECK-NEXT: s_mul_i32 s6, s1, s2 +; CHECK-NEXT: s_mul_i32 s3, s0, s3 +; CHECK-NEXT: v_readfirstlane_b32 s2, v3 +; CHECK-NEXT: v_readfirstlane_b32 s7, v4 +; CHECK-NEXT: s_add_i32 s6, s6, s7 +; CHECK-NEXT: s_ashr_i32 s7, s2, 31 +; CHECK-NEXT: s_abs_i32 s8, s2 +; CHECK-NEXT: s_add_i32 s3, s3, s6 +; CHECK-NEXT: s_xor_b32 s6, s7, s4 ; CHECK-NEXT: v_mul_hi_u32 v3, s8, v2 -; CHECK-NEXT: v_readfirstlane_b32 s9, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CHECK-NEXT: s_mul_i32 s9, s9, s7 +; CHECK-NEXT: v_readfirstlane_b32 s7, v3 +; CHECK-NEXT: s_mul_i32 s9, s7, s5 +; CHECK-NEXT: s_add_i32 s10, s7, 1 ; CHECK-NEXT: s_sub_i32 s8, s8, s9 -; CHECK-NEXT: s_sub_i32 s9, s8, s7 -; CHECK-NEXT: s_cmp_ge_u32 s8, s7 -; CHECK-NEXT: s_cselect_b64 vcc, -1, 0 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CHECK-NEXT: s_sub_i32 s9, s8, s5 +; CHECK-NEXT: s_cmp_ge_u32 s8, s5 +; CHECK-NEXT: s_cselect_b32 s7, s10, s7 ; CHECK-NEXT: s_cselect_b32 s8, s9, s8 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CHECK-NEXT: s_cmp_ge_u32 s8, s7 -; CHECK-NEXT: s_cselect_b64 vcc, -1, 0 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; 
CHECK-NEXT: v_xor_b32_e32 v3, s5, v3 -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3 -; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; CHECK-NEXT: v_or_b32_e32 v3, s4, v3 -; CHECK-NEXT: v_or_b32_e32 v4, v4, v5 +; CHECK-NEXT: s_add_i32 s9, s7, 1 +; CHECK-NEXT: s_cmp_ge_u32 s8, s5 +; CHECK-NEXT: s_cselect_b32 s7, s9, s7 +; CHECK-NEXT: s_xor_b32 s7, s7, s6 +; CHECK-NEXT: s_sub_i32 s6, s7, s6 +; CHECK-NEXT: s_ashr_i32 s7, s6, 31 +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; CHECK-NEXT: v_mov_b32_e32 v4, s3 +; CHECK-NEXT: v_mov_b32_e32 v3, s2 ; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[3:4] -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_mov_b64 vcc, s[0:1] +; CHECK-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index bfc310ad93ead..e4b8de90b44b9 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -475,17 +475,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] @@ -493,12 +496,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; 
kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -512,17 +515,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1015,10 +1015,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -1781,17 +1781,20 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill 
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] @@ -1804,6 +1807,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2317,10 +2321,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 5c0f813c8c829..b509361d2bab2 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -25,36 +25,39 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mov_b32 s8, s6 ; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 -; GCN-NEXT: v_max_i32_e32 v2, v1, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_max_i32_e32 v5, v0, v5 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GCN-NEXT: 
v_cndmask_b32_e32 v1, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: s_abs_i32 s7, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GCN-NEXT: s_sub_i32 s0, 0, s7 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: s_abs_i32 s8, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 +; GCN-NEXT: s_xor_b32 s4, s4, s6 +; GCN-NEXT: s_ashr_i32 s4, s4, 31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_mul_i32 s5, s5, s7 +; GCN-NEXT: s_sub_i32 s5, s8, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s7 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s5, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s5, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -69,36 +72,39 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_mov_b32 s8, s6 ; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, 0, v1 -; TONGA-NEXT: v_max_i32_e32 v2, v1, v2 -; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 -; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0 -; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 -; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 -; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3 -; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_readfirstlane_b32 s6, v1 +; TONGA-NEXT: s_abs_i32 s7, s6 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s7 +; TONGA-NEXT: s_sub_i32 s0, 0, s7 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 +; TONGA-NEXT: v_mul_lo_u32 v2, s0, v1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: v_readfirstlane_b32 s4, v0 +; TONGA-NEXT: s_abs_i32 s8, s4 +; TONGA-NEXT: v_mul_hi_u32 v2, v1, v2 +; TONGA-NEXT: s_xor_b32 s4, s4, s6 +; TONGA-NEXT: s_ashr_i32 s4, s4, 31 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v2 +; TONGA-NEXT: v_mul_hi_u32 v0, s8, v0 +; TONGA-NEXT: v_readfirstlane_b32 s5, v0 +; TONGA-NEXT: s_mul_i32 s5, s5, s7 +; TONGA-NEXT: 
s_sub_i32 s5, s8, s5 +; TONGA-NEXT: s_sub_i32 s6, s5, s7 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; TONGA-NEXT: s_cmp_ge_u32 s5, s7 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; TONGA-NEXT: s_cselect_b32 s5, s6, s5 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; TONGA-NEXT: s_cmp_ge_u32 s5, s7 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, s4, v0 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s4, v0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -2006,7 +2012,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2014,7 +2020,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -2053,7 +2059,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2061,7 +2067,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4addf42b27984..a49c7d589c301 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -588,19 +588,19 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_abs_i32 s9, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_sub_i32 s2, 0, s9 +; GCN-NEXT: s_abs_i32 s8, s9 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_abs_i32 s0, s3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_xor_b32 s1, s3, s8 +; GCN-NEXT: s_xor_b32 s1, s3, s9 ; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -609,15 +609,15 @@ define 
amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s3, s2, s9 +; GCN-NEXT: s_mul_i32 s3, s2, s8 ; GCN-NEXT: s_sub_i32 s0, s0, s3 -; GCN-NEXT: s_add_i32 s8, s2, 1 -; GCN-NEXT: s_sub_i32 s3, s0, s9 -; GCN-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-NEXT: s_add_i32 s9, s2, 1 +; GCN-NEXT: s_sub_i32 s3, s0, s8 +; GCN-NEXT: s_cmp_ge_u32 s0, s8 +; GCN-NEXT: s_cselect_b32 s2, s9, s2 ; GCN-NEXT: s_cselect_b32 s0, s3, s0 ; GCN-NEXT: s_add_i32 s3, s2, 1 -; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cmp_ge_u32 s0, s8 ; GCN-NEXT: s_cselect_b32 s0, s3, s2 ; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_sub_i32 s0, s0, s1 @@ -629,19 +629,19 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_abs_i32 s9, s8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 +; GCN-IR-NEXT: s_abs_i32 s8, s9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_abs_i32 s0, s3 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_xor_b32 s1, s3, s8 +; GCN-IR-NEXT: s_xor_b32 s1, s3, s9 ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -650,15 +650,15 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-IR-NEXT: s_mul_i32 s3, s2, s9 +; GCN-IR-NEXT: s_mul_i32 s3, s2, s8 ; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 -; GCN-IR-NEXT: s_add_i32 s8, s2, 1 -; GCN-IR-NEXT: s_sub_i32 s3, s0, s9 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 -; GCN-IR-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-IR-NEXT: s_add_i32 s9, s2, 1 +; GCN-IR-NEXT: s_sub_i32 s3, s0, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 +; GCN-IR-NEXT: s_cselect_b32 s2, s9, s2 ; GCN-IR-NEXT: s_cselect_b32 s0, s3, s0 ; GCN-IR-NEXT: s_add_i32 s3, s2, 1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 ; GCN-IR-NEXT: s_cselect_b32 s0, s3, s2 ; GCN-IR-NEXT: s_xor_b32 s0, s0, s1 ; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 @@ -677,11 +677,11 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s0, 1 +; GCN-NEXT: s_ashr_i32 s8, s1, 1 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -721,11 +721,11 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 
0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s8, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s8, s1, 1 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -841,11 +841,11 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s0, 7 +; GCN-NEXT: s_ashr_i32 s8, s1, 7 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -885,11 +885,11 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s8, s0, 7 +; GCN-IR-NEXT: s_ashr_i32 s8, s1, 7 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index f614f58d8e1dc..7ddb5539c4d1e 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 +; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -4738,52 +4738,68 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; GCN-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 -; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6 -; GCN-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc -; GCN-NEXT: v_and_b32_e32 v5, -4, v5 -; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GCN-NEXT: v_and_b32_e32 v6, -4, v6 -; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 -; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: s_ashr_i32 s6, s3, 31 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_lshr_b32 s6, s6, 30 +; GCN-NEXT: s_add_u32 s6, s2, s6 +; GCN-NEXT: s_addc_u32 s7, s3, 0 +; GCN-NEXT: s_and_b32 s6, s6, -4 +; GCN-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NEXT: s_sub_u32 s2, s2, s6 +; GCN-NEXT: s_subb_u32 s3, s3, s7 +; GCN-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s2, s6, 30 +; GCN-NEXT: s_add_u32 s2, s4, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_addc_u32 s3, s5, 0 +; 
GCN-NEXT: s_and_b32 s2, s2, -4 +; GCN-NEXT: s_sub_u32 s2, s4, s2 +; GCN-NEXT: s_subb_u32 s3, s5, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; TAHITI-NEXT: s_mov_b32 s7, 0xf000 -; TAHITI-NEXT: s_mov_b32 s6, -1 -; TAHITI-NEXT: s_mov_b32 s10, s6 -; TAHITI-NEXT: s_mov_b32 s11, s7 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; TAHITI-NEXT: s_mov_b32 s3, 0xf000 +; TAHITI-NEXT: s_mov_b32 s2, -1 +; TAHITI-NEXT: s_mov_b32 s10, s2 +; TAHITI-NEXT: s_mov_b32 s11, s3 ; TAHITI-NEXT: s_waitcnt lgkmcnt(0) -; TAHITI-NEXT: s_mov_b32 s8, s2 -; TAHITI-NEXT: s_mov_b32 s9, s3 +; TAHITI-NEXT: s_mov_b32 s8, s6 +; TAHITI-NEXT: s_mov_b32 s9, s7 ; TAHITI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TAHITI-NEXT: s_mov_b32 s4, s0 -; TAHITI-NEXT: s_mov_b32 s5, s1 +; TAHITI-NEXT: s_mov_b32 s1, s5 +; TAHITI-NEXT: s_mov_b32 s0, s4 ; TAHITI-NEXT: s_waitcnt vmcnt(0) -; TAHITI-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; TAHITI-NEXT: v_lshrrev_b32_e32 v4, 30, v4 -; TAHITI-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; TAHITI-NEXT: v_add_i32_e32 v4, vcc, v0, v4 -; TAHITI-NEXT: v_lshrrev_b32_e32 v5, 30, v5 -; TAHITI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; TAHITI-NEXT: v_add_i32_e32 v5, vcc, v2, v5 -; TAHITI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; TAHITI-NEXT: v_and_b32_e32 v4, -4, v4 -; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; TAHITI-NEXT: v_and_b32_e32 v5, -4, v5 -; TAHITI-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; TAHITI-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; TAHITI-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; TAHITI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TAHITI-NEXT: v_readfirstlane_b32 s5, v1 +; TAHITI-NEXT: s_ashr_i32 s8, s5, 31 +; TAHITI-NEXT: v_readfirstlane_b32 s4, v0 +; TAHITI-NEXT: s_lshr_b32 s8, s8, 30 +; TAHITI-NEXT: s_add_u32 s8, s4, s8 +; TAHITI-NEXT: s_addc_u32 s9, s5, 0 +; TAHITI-NEXT: s_and_b32 s8, s8, -4 +; TAHITI-NEXT: v_readfirstlane_b32 s7, v3 +; TAHITI-NEXT: s_sub_u32 s4, s4, s8 +; TAHITI-NEXT: s_subb_u32 s5, s5, s9 +; TAHITI-NEXT: s_ashr_i32 s8, s7, 31 +; TAHITI-NEXT: v_readfirstlane_b32 s6, v2 +; TAHITI-NEXT: v_mov_b32_e32 v0, s4 +; TAHITI-NEXT: s_lshr_b32 s4, s8, 30 +; TAHITI-NEXT: s_add_u32 s4, s6, s4 +; TAHITI-NEXT: v_mov_b32_e32 v1, s5 +; TAHITI-NEXT: s_addc_u32 s5, s7, 0 +; TAHITI-NEXT: s_and_b32 s4, s4, -4 +; TAHITI-NEXT: s_sub_u32 s4, s6, s4 +; TAHITI-NEXT: s_subb_u32 s5, s7, s5 +; TAHITI-NEXT: v_mov_b32_e32 v2, s4 +; TAHITI-NEXT: v_mov_b32_e32 v3, s5 +; TAHITI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TAHITI-NEXT: s_endpgm ; ; TONGA-LABEL: srem_v2i64_4: @@ -4793,23 +4809,31 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; TONGA-NEXT: v_mov_b32_e32 v4, s0 ; TONGA-NEXT: v_mov_b32_e32 v5, s1 +; TONGA-NEXT: v_mov_b32_e32 v4, s0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6 -; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v0, v6 -; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v2, v7 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; TONGA-NEXT: v_and_b32_e32 v6, -4, v6 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v6 -; 
TONGA-NEXT: v_and_b32_e32 v7, -4, v7 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v7 -; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; TONGA-NEXT: v_readfirstlane_b32 s1, v1 +; TONGA-NEXT: s_ashr_i32 s4, s1, 31 +; TONGA-NEXT: v_readfirstlane_b32 s0, v0 +; TONGA-NEXT: s_lshr_b32 s4, s4, 30 +; TONGA-NEXT: s_add_u32 s4, s0, s4 +; TONGA-NEXT: s_addc_u32 s5, s1, 0 +; TONGA-NEXT: s_and_b32 s4, s4, -4 +; TONGA-NEXT: v_readfirstlane_b32 s3, v3 +; TONGA-NEXT: s_sub_u32 s0, s0, s4 +; TONGA-NEXT: s_subb_u32 s1, s1, s5 +; TONGA-NEXT: s_ashr_i32 s4, s3, 31 +; TONGA-NEXT: v_readfirstlane_b32 s2, v2 +; TONGA-NEXT: v_mov_b32_e32 v0, s0 +; TONGA-NEXT: s_lshr_b32 s0, s4, 30 +; TONGA-NEXT: s_add_u32 s0, s2, s0 +; TONGA-NEXT: v_mov_b32_e32 v1, s1 +; TONGA-NEXT: s_addc_u32 s1, s3, 0 +; TONGA-NEXT: s_and_b32 s0, s0, -4 +; TONGA-NEXT: s_sub_u32 s0, s2, s0 +; TONGA-NEXT: s_subb_u32 s1, s3, s1 +; TONGA-NEXT: v_mov_b32_e32 v2, s0 +; TONGA-NEXT: v_mov_b32_e32 v3, s1 ; TONGA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; TONGA-NEXT: s_endpgm ; @@ -6232,7 +6256,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 @@ -8886,38 +8910,54 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 30, v9 -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GCN-NEXT: v_add_co_u32_e32 v9, vcc, v0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 30, v10 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GCN-NEXT: v_readfirstlane_b32 s2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v2, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 30, v11 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v3, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v4, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v6, v12 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v7, vcc -; GCN-NEXT: v_and_b32_e32 v9, -4, v9 -; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9 -; GCN-NEXT: v_and_b32_e32 v10, -4, v10 -; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v13, vcc -; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_and_b32_e32 v11, -4, v11 -; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v14, vcc -; GCN-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v11 -; GCN-NEXT: v_and_b32_e32 v12, -4, v12 -; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v15, vcc -; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v12 -; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v16, vcc +; GCN-NEXT: v_readfirstlane_b32 s7, v5 +; GCN-NEXT: s_ashr_i32 s10, s7, 31 +; GCN-NEXT: v_readfirstlane_b32 s6, v4 +; GCN-NEXT: s_lshr_b32 s10, s10, 30 +; GCN-NEXT: 
s_add_u32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s11, s7, 0 +; GCN-NEXT: s_and_b32 s10, s10, -4 +; GCN-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NEXT: s_sub_u32 s6, s6, s10 +; GCN-NEXT: s_subb_u32 s7, s7, s11 +; GCN-NEXT: s_ashr_i32 s10, s9, 31 +; GCN-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_lshr_b32 s6, s10, 30 +; GCN-NEXT: s_add_u32 s6, s8, s6 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: s_and_b32 s6, s6, -4 +; GCN-NEXT: s_sub_u32 s6, s8, s6 +; GCN-NEXT: s_subb_u32 s7, s9, s7 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: s_lshr_b32 s6, s8, 30 +; GCN-NEXT: s_add_u32 s6, s2, s6 +; GCN-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_addc_u32 s7, s3, 0 +; GCN-NEXT: s_and_b32 s6, s6, -4 +; GCN-NEXT: s_sub_u32 s2, s2, s6 +; GCN-NEXT: s_subb_u32 s3, s3, s7 +; GCN-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: s_lshr_b32 s2, s6, 30 +; GCN-NEXT: s_add_u32 s2, s4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: s_addc_u32 s3, s5, 0 +; GCN-NEXT: s_and_b32 s2, s2, -4 +; GCN-NEXT: s_sub_u32 s2, s4, s2 +; GCN-NEXT: s_subb_u32 s3, s5, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm @@ -8932,40 +8972,56 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TAHITI-NEXT: s_waitcnt lgkmcnt(0) ; TAHITI-NEXT: s_mov_b32 s8, s6 ; TAHITI-NEXT: s_mov_b32 s9, s7 -; TAHITI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TAHITI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; TAHITI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; TAHITI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; TAHITI-NEXT: s_mov_b32 s0, s4 ; TAHITI-NEXT: s_mov_b32 s1, s5 ; TAHITI-NEXT: s_waitcnt vmcnt(1) -; TAHITI-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; TAHITI-NEXT: v_lshrrev_b32_e32 v8, 30, v8 -; TAHITI-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; TAHITI-NEXT: v_add_i32_e32 v8, vcc, v0, v8 -; TAHITI-NEXT: v_lshrrev_b32_e32 v9, 30, v9 -; TAHITI-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc +; TAHITI-NEXT: v_readfirstlane_b32 s4, v0 ; TAHITI-NEXT: s_waitcnt vmcnt(0) -; TAHITI-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; TAHITI-NEXT: v_add_i32_e32 v9, vcc, v2, v9 -; TAHITI-NEXT: v_lshrrev_b32_e32 v10, 30, v10 -; TAHITI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; TAHITI-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; TAHITI-NEXT: v_add_i32_e32 v10, vcc, v4, v10 -; TAHITI-NEXT: v_lshrrev_b32_e32 v11, 30, v11 -; TAHITI-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc -; TAHITI-NEXT: v_add_i32_e32 v11, vcc, v6, v11 -; TAHITI-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc -; TAHITI-NEXT: v_and_b32_e32 v8, -4, v8 -; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; TAHITI-NEXT: v_and_b32_e32 v9, -4, v9 -; TAHITI-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc -; TAHITI-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; TAHITI-NEXT: v_and_b32_e32 v10, -4, v10 -; TAHITI-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc -; TAHITI-NEXT: v_sub_i32_e32 v4, vcc, v4, v10 -; TAHITI-NEXT: v_and_b32_e32 v11, -4, v11 -; TAHITI-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc -; TAHITI-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 -; TAHITI-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc +; TAHITI-NEXT: v_readfirstlane_b32 s9, v5 +; TAHITI-NEXT: s_ashr_i32 s12, s9, 31 +; 
TAHITI-NEXT: v_readfirstlane_b32 s8, v4 +; TAHITI-NEXT: s_lshr_b32 s12, s12, 30 +; TAHITI-NEXT: s_add_u32 s12, s8, s12 +; TAHITI-NEXT: s_addc_u32 s13, s9, 0 +; TAHITI-NEXT: s_and_b32 s12, s12, -4 +; TAHITI-NEXT: v_readfirstlane_b32 s11, v7 +; TAHITI-NEXT: s_sub_u32 s8, s8, s12 +; TAHITI-NEXT: s_subb_u32 s9, s9, s13 +; TAHITI-NEXT: s_ashr_i32 s12, s11, 31 +; TAHITI-NEXT: v_readfirstlane_b32 s10, v6 +; TAHITI-NEXT: v_mov_b32_e32 v0, s8 +; TAHITI-NEXT: s_lshr_b32 s8, s12, 30 +; TAHITI-NEXT: s_add_u32 s8, s10, s8 +; TAHITI-NEXT: v_readfirstlane_b32 s5, v1 +; TAHITI-NEXT: v_mov_b32_e32 v1, s9 +; TAHITI-NEXT: s_addc_u32 s9, s11, 0 +; TAHITI-NEXT: s_and_b32 s8, s8, -4 +; TAHITI-NEXT: s_sub_u32 s8, s10, s8 +; TAHITI-NEXT: s_subb_u32 s9, s11, s9 +; TAHITI-NEXT: s_ashr_i32 s10, s5, 31 +; TAHITI-NEXT: v_readfirstlane_b32 s6, v2 +; TAHITI-NEXT: v_mov_b32_e32 v2, s8 +; TAHITI-NEXT: s_lshr_b32 s8, s10, 30 +; TAHITI-NEXT: s_add_u32 s8, s4, s8 +; TAHITI-NEXT: v_readfirstlane_b32 s7, v3 +; TAHITI-NEXT: v_mov_b32_e32 v3, s9 +; TAHITI-NEXT: s_addc_u32 s9, s5, 0 +; TAHITI-NEXT: s_and_b32 s8, s8, -4 +; TAHITI-NEXT: s_sub_u32 s4, s4, s8 +; TAHITI-NEXT: s_subb_u32 s5, s5, s9 +; TAHITI-NEXT: s_ashr_i32 s8, s7, 31 +; TAHITI-NEXT: v_mov_b32_e32 v4, s4 +; TAHITI-NEXT: s_lshr_b32 s4, s8, 30 +; TAHITI-NEXT: s_add_u32 s4, s6, s4 +; TAHITI-NEXT: v_mov_b32_e32 v5, s5 +; TAHITI-NEXT: s_addc_u32 s5, s7, 0 +; TAHITI-NEXT: s_and_b32 s4, s4, -4 +; TAHITI-NEXT: s_sub_u32 s4, s6, s4 +; TAHITI-NEXT: s_subb_u32 s5, s7, s5 +; TAHITI-NEXT: v_mov_b32_e32 v6, s4 +; TAHITI-NEXT: v_mov_b32_e32 v7, s5 ; TAHITI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; TAHITI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TAHITI-NEXT: s_endpgm @@ -8974,52 +9030,69 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: v_mov_b32_e32 v0, s2 -; TONGA-NEXT: v_mov_b32_e32 v1, s3 -; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: s_add_u32 s4, s2, 16 ; TONGA-NEXT: s_addc_u32 s5, s3, 0 -; TONGA-NEXT: v_mov_b32_e32 v4, s4 -; TONGA-NEXT: v_mov_b32_e32 v5, s5 +; TONGA-NEXT: v_mov_b32_e32 v0, s4 +; TONGA-NEXT: v_mov_b32_e32 v5, s3 +; TONGA-NEXT: v_mov_b32_e32 v1, s5 +; TONGA-NEXT: v_mov_b32_e32 v4, s2 +; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; TONGA-NEXT: v_mov_b32_e32 v9, s1 ; TONGA-NEXT: v_mov_b32_e32 v8, s0 -; TONGA-NEXT: s_add_u32 s0, s0, 16 -; TONGA-NEXT: s_addc_u32 s1, s1, 0 -; TONGA-NEXT: v_mov_b32_e32 v11, s1 -; TONGA-NEXT: v_mov_b32_e32 v10, s0 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; TONGA-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12 -; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 -; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v2, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 +; TONGA-NEXT: v_readfirstlane_b32 s2, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 -; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 -; TONGA-NEXT: v_lshrrev_b32_e32 v15, 30, v15 
-; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc -; TONGA-NEXT: v_add_u32_e64 v12, s[0:1], v4, v14 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v6, v15 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v5, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 -; TONGA-NEXT: v_and_b32_e32 v13, -4, v13 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 -; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc +; TONGA-NEXT: v_readfirstlane_b32 s7, v5 +; TONGA-NEXT: s_ashr_i32 s10, s7, 31 +; TONGA-NEXT: v_readfirstlane_b32 s6, v4 +; TONGA-NEXT: s_lshr_b32 s10, s10, 30 +; TONGA-NEXT: s_add_u32 s10, s6, s10 +; TONGA-NEXT: s_addc_u32 s11, s7, 0 +; TONGA-NEXT: s_and_b32 s10, s10, -4 +; TONGA-NEXT: v_readfirstlane_b32 s9, v7 +; TONGA-NEXT: s_sub_u32 s6, s6, s10 +; TONGA-NEXT: s_subb_u32 s7, s7, s11 +; TONGA-NEXT: s_ashr_i32 s10, s9, 31 +; TONGA-NEXT: v_readfirstlane_b32 s8, v6 +; TONGA-NEXT: v_mov_b32_e32 v0, s6 +; TONGA-NEXT: s_lshr_b32 s6, s10, 30 +; TONGA-NEXT: s_add_u32 s6, s8, s6 +; TONGA-NEXT: v_readfirstlane_b32 s3, v1 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: s_addc_u32 s7, s9, 0 +; TONGA-NEXT: s_and_b32 s6, s6, -4 +; TONGA-NEXT: s_sub_u32 s6, s8, s6 +; TONGA-NEXT: s_subb_u32 s7, s9, s7 +; TONGA-NEXT: s_ashr_i32 s8, s3, 31 +; TONGA-NEXT: v_readfirstlane_b32 s4, v2 +; TONGA-NEXT: v_mov_b32_e32 v2, s6 +; TONGA-NEXT: s_lshr_b32 s6, s8, 30 +; TONGA-NEXT: s_add_u32 s6, s2, s6 +; TONGA-NEXT: v_readfirstlane_b32 s5, v3 +; TONGA-NEXT: v_mov_b32_e32 v3, s7 +; TONGA-NEXT: s_addc_u32 s7, s3, 0 +; TONGA-NEXT: s_and_b32 s6, s6, -4 +; TONGA-NEXT: s_sub_u32 s2, s2, s6 +; TONGA-NEXT: s_subb_u32 s3, s3, s7 +; TONGA-NEXT: s_ashr_i32 s6, s5, 31 ; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; TONGA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: v_mov_b32_e32 v0, s2 +; TONGA-NEXT: s_lshr_b32 s2, s6, 30 +; TONGA-NEXT: s_add_u32 s2, s4, s2 +; TONGA-NEXT: v_mov_b32_e32 v1, s3 +; TONGA-NEXT: s_addc_u32 s3, s5, 0 +; TONGA-NEXT: s_and_b32 s2, s2, -4 +; TONGA-NEXT: s_sub_u32 s2, s4, s2 +; TONGA-NEXT: s_subb_u32 s3, s5, s3 +; TONGA-NEXT: s_add_u32 s0, s0, 16 +; TONGA-NEXT: s_addc_u32 s1, s1, 0 +; TONGA-NEXT: v_mov_b32_e32 v5, s1 +; TONGA-NEXT: v_mov_b32_e32 v2, s2 +; TONGA-NEXT: v_mov_b32_e32 v3, s3 +; TONGA-NEXT: v_mov_b32_e32 v4, s0 +; TONGA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; TONGA-NEXT: s_endpgm ; ; EG-LABEL: srem_v4i64_4: diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index e64e3def98c26..6e814eefc2d89 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -653,11 +653,11 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s0, 7 +; GCN-NEXT: s_ashr_i32 s0, s1, 7 ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -694,11 +694,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; 
GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 7 +; GCN-IR-NEXT: s_ashr_i32 s0, s1, 7 ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -742,11 +742,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s0, 1 +; GCN-NEXT: s_ashr_i32 s0, s1, 1 ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -783,11 +783,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s0, s1, 1 ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -832,11 +832,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_abs_i32 s8, s1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -871,11 +871,11 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_abs_i32 s8, s1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -921,47 +921,45 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 -; GCN-NEXT: s_ashr_i32 s6, s5, 31 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s6 -; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] +; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31 +; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31 +; GCN-NEXT: s_ashr_i32 s4, s5, 31 +; GCN-NEXT: s_add_u32 s6, s6, s4 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_addc_u32 s7, s7, s4 +; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s4, 0, s8 -; GCN-NEXT: s_subb_u32 s5, 0, s9 -; GCN-NEXT: s_ashr_i32 s10, s3, 31 +; GCN-NEXT: s_sub_u32 s2, 0, 
s8 +; GCN-NEXT: s_subb_u32 s4, 0, s9 +; GCN-NEXT: s_ashr_i32 s12, s3, 31 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_add_u32 s2, s2, s10 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_addc_u32 s3, s3, s10 +; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -969,12 +967,12 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -990,18 +988,20 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: s_add_u32 s2, s10, s12 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: s_addc_u32 s3, s11, s12 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s13, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s13, v1 +; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 -; GCN-NEXT: s_mov_b32 s4, s0 
+; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1013,9 +1013,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] @@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s13 +; GCN-NEXT: v_mov_b32_e32 v4, s11 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc @@ -1042,10 +1042,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1053,25 +1053,25 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem33_64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 31 ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 31 ; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[8:9], 31 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s6, s2, s4 ; GCN-IR-NEXT: s_subb_u32 s7, s3, s4 ; GCN-IR-NEXT: s_ashr_i32 s2, s9, 31 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, s8, s2 ; GCN-IR-NEXT: s_subb_u32 s9, s9, s2 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[8:9], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[2:3] ; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[6:7] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll 
b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index b5d9d00c48045..ed2f06b8136a2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3963,8 +3963,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4067,8 +4067,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4175,8 +4175,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4283,8 +4283,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4387,8 +4387,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4502,8 +4502,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], 
v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index 2a989ecd2ebad..8812cae20f110 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3963,8 +3963,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4067,8 +4067,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4175,8 +4175,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4283,8 +4283,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4387,8 +4387,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4502,8 +4502,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: 
v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 69fd58aadfbcc..82eb122f9f703 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3839,8 +3839,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3943,8 +3943,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4051,8 +4051,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4159,8 +4159,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4263,8 +4263,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4378,8 +4378,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 
:: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 1d3b42ee43b0f..7b304b2c81e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3575,8 +3575,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3679,8 +3679,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3787,8 +3787,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -3895,8 +3895,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -3999,8 +3999,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 
v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4114,8 +4114,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index cdbbabe3e3b05..f2f947814c667 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -151,74 +151,74 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<66>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [urem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [urem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_param_1]; -; CHECK-NEXT: or.b64 %rd7, %rd1, %rd2; -; CHECK-NEXT: setp.eq.b64 %p1, %rd7, 0; -; CHECK-NEXT: or.b64 %rd8, %rd5, %rd6; -; CHECK-NEXT: setp.eq.b64 %p2, %rd8, 0; +; CHECK-NEXT: or.b64 %rd9, %rd1, %rd2; +; CHECK-NEXT: setp.eq.b64 %p1, %rd9, 0; +; CHECK-NEXT: or.b64 %rd10, %rd7, %rd8; +; CHECK-NEXT: setp.eq.b64 %p2, %rd10, 0; ; CHECK-NEXT: or.pred %p3, %p1, %p2; ; CHECK-NEXT: setp.ne.b64 %p4, %rd2, 0; ; CHECK-NEXT: clz.b64 %r1, %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd9, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd11, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd1; -; CHECK-NEXT: cvt.u64.u32 %rd10, %r2; -; CHECK-NEXT: add.s64 %rd11, %rd10, 64; -; CHECK-NEXT: selp.b64 %rd12, %rd9, %rd11, %p4; -; CHECK-NEXT: setp.ne.b64 %p5, %rd6, 0; -; CHECK-NEXT: clz.b64 %r3, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd13, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd14, %r4; -; CHECK-NEXT: add.s64 %rd15, %rd14, 64; -; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5; +; CHECK-NEXT: cvt.u64.u32 %rd12, %r2; +; CHECK-NEXT: add.s64 %rd13, %rd12, 64; +; CHECK-NEXT: selp.b64 %rd14, %rd11, %rd13, %p4; +; CHECK-NEXT: setp.ne.b64 %p5, %rd8, 0; +; CHECK-NEXT: clz.b64 %r3, %rd8; +; CHECK-NEXT: cvt.u64.u32 %rd15, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd7; +; CHECK-NEXT: cvt.u64.u32 %rd16, %r4; +; CHECK-NEXT: add.s64 %rd17, %rd16, 64; +; CHECK-NEXT: selp.b64 %rd18, %rd15, %rd17, %p5; ; CHECK-NEXT: mov.b64 %rd57, 0; -; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16; -; CHECK-NEXT: subc.cc.s64 %rd18, %rd57, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0; +; CHECK-NEXT: sub.cc.s64 %rd3, %rd14, %rd18; +; CHECK-NEXT: subc.cc.s64 %rd4, %rd57, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd3, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd4, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd4, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd19, %rd17, 127; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18; +; CHECK-NEXT: xor.b64 %rd19, %rd3, 127; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd4; ; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0; -; CHECK-NEXT: selp.b64 %rd65, 0, %rd6, %p11; -; CHECK-NEXT: selp.b64 %rd64, 0, %rd5, %p11; +; CHECK-NEXT: selp.b64 %rd65, 0, %rd8, %p11; +; CHECK-NEXT: selp.b64 
%rd64, 0, %rd7, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd58, %rd17, 1; -; CHECK-NEXT: addc.cc.s64 %rd59, %rd18, 0; +; CHECK-NEXT: add.cc.s64 %rd58, %rd3, 1; +; CHECK-NEXT: addc.cc.s64 %rd59, %rd4, 0; ; CHECK-NEXT: or.b64 %rd21, %rd58, %rd59; ; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd3; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6; +; CHECK-NEXT: shl.b64 %rd22, %rd8, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7; +; CHECK-NEXT: shr.u64 %rd23, %rd7, %r7; ; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8; +; CHECK-NEXT: shl.b64 %rd25, %rd7, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; ; CHECK-NEXT: selp.b64 %rd63, %rd25, %rd24, %p15; -; CHECK-NEXT: shl.b64 %rd62, %rd5, %r6; +; CHECK-NEXT: shl.b64 %rd62, %rd7, %r6; ; CHECK-NEXT: mov.b64 %rd56, %rd57; ; CHECK-NEXT: @%p14 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader ; CHECK-NEXT: cvt.u32.u64 %r9, %rd58; -; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9; +; CHECK-NEXT: shr.u64 %rd26, %rd7, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd27, %rd6, %r10; +; CHECK-NEXT: shl.b64 %rd27, %rd8, %r10; ; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd29, %rd6, %r11; +; CHECK-NEXT: shr.u64 %rd29, %rd8, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; ; CHECK-NEXT: selp.b64 %rd60, %rd29, %rd28, %p16; -; CHECK-NEXT: shr.u64 %rd61, %rd6, %r9; -; CHECK-NEXT: add.cc.s64 %rd3, %rd1, -1; -; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, -1; +; CHECK-NEXT: shr.u64 %rd61, %rd8, %r9; +; CHECK-NEXT: add.cc.s64 %rd5, %rd1, -1; +; CHECK-NEXT: addc.cc.s64 %rd6, %rd2, -1; ; CHECK-NEXT: mov.b64 %rd56, 0; ; CHECK-NEXT: mov.b64 %rd57, %rd56; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while @@ -235,8 +235,8 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: shl.b64 %rd39, %rd62, 1; ; CHECK-NEXT: or.b64 %rd62, %rd57, %rd39; ; CHECK-NEXT: or.b64 %rd63, %rd56, %rd38; -; CHECK-NEXT: sub.cc.s64 %rd40, %rd3, %rd35; -; CHECK-NEXT: subc.cc.s64 %rd41, %rd4, %rd32; +; CHECK-NEXT: sub.cc.s64 %rd40, %rd5, %rd35; +; CHECK-NEXT: subc.cc.s64 %rd41, %rd6, %rd32; ; CHECK-NEXT: shr.s64 %rd42, %rd41, 63; ; CHECK-NEXT: and.b64 %rd57, %rd42, 1; ; CHECK-NEXT: and.b64 %rd43, %rd42, %rd1; @@ -261,8 +261,8 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: mad.lo.s64 %rd51, %rd1, %rd65, %rd50; ; CHECK-NEXT: mad.lo.s64 %rd52, %rd2, %rd64, %rd51; ; CHECK-NEXT: mul.lo.s64 %rd53, %rd1, %rd64; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd5, %rd53; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd6, %rd52; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd7, %rd53; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd8, %rd52; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd54, %rd55}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs @@ -447,74 +447,74 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<60>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [udiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [udiv_i128_param_1]; -; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6; -; CHECK-NEXT: setp.eq.b64 %p1, %rd7, 0; -; CHECK-NEXT: or.b64 %rd8, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd8, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [udiv_i128_param_0]; +; 
CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [udiv_i128_param_1]; +; CHECK-NEXT: or.b64 %rd9, %rd7, %rd8; +; CHECK-NEXT: setp.eq.b64 %p1, %rd9, 0; +; CHECK-NEXT: or.b64 %rd10, %rd5, %rd6; +; CHECK-NEXT: setp.eq.b64 %p2, %rd10, 0; ; CHECK-NEXT: or.pred %p3, %p1, %p2; -; CHECK-NEXT: setp.ne.b64 %p4, %rd6, 0; -; CHECK-NEXT: clz.b64 %r1, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd9, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd10, %r2; -; CHECK-NEXT: add.s64 %rd11, %rd10, 64; -; CHECK-NEXT: selp.b64 %rd12, %rd9, %rd11, %p4; -; CHECK-NEXT: setp.ne.b64 %p5, %rd4, 0; -; CHECK-NEXT: clz.b64 %r3, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd13, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd14, %r4; -; CHECK-NEXT: add.s64 %rd15, %rd14, 64; -; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5; +; CHECK-NEXT: setp.ne.b64 %p4, %rd8, 0; +; CHECK-NEXT: clz.b64 %r1, %rd8; +; CHECK-NEXT: cvt.u64.u32 %rd11, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd7; +; CHECK-NEXT: cvt.u64.u32 %rd12, %r2; +; CHECK-NEXT: add.s64 %rd13, %rd12, 64; +; CHECK-NEXT: selp.b64 %rd14, %rd11, %rd13, %p4; +; CHECK-NEXT: setp.ne.b64 %p5, %rd6, 0; +; CHECK-NEXT: clz.b64 %r3, %rd6; +; CHECK-NEXT: cvt.u64.u32 %rd15, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd16, %r4; +; CHECK-NEXT: add.s64 %rd17, %rd16, 64; +; CHECK-NEXT: selp.b64 %rd18, %rd15, %rd17, %p5; ; CHECK-NEXT: mov.b64 %rd51, 0; -; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16; -; CHECK-NEXT: subc.cc.s64 %rd18, %rd51, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0; +; CHECK-NEXT: sub.cc.s64 %rd1, %rd14, %rd18; +; CHECK-NEXT: subc.cc.s64 %rd2, %rd51, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd1, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd2, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd2, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd19, %rd17, 127; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18; +; CHECK-NEXT: xor.b64 %rd19, %rd1, 127; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd2; ; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0; -; CHECK-NEXT: selp.b64 %rd59, 0, %rd4, %p11; -; CHECK-NEXT: selp.b64 %rd58, 0, %rd3, %p11; +; CHECK-NEXT: selp.b64 %rd59, 0, %rd6, %p11; +; CHECK-NEXT: selp.b64 %rd58, 0, %rd5, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB5_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd52, %rd17, 1; -; CHECK-NEXT: addc.cc.s64 %rd53, %rd18, 0; +; CHECK-NEXT: add.cc.s64 %rd52, %rd1, 1; +; CHECK-NEXT: addc.cc.s64 %rd53, %rd2, 0; ; CHECK-NEXT: or.b64 %rd21, %rd52, %rd53; ; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd22, %rd4, %r6; +; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd23, %rd3, %r7; +; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7; ; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd25, %rd3, %r8; +; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; ; CHECK-NEXT: selp.b64 %rd57, %rd25, %rd24, %p15; -; CHECK-NEXT: shl.b64 %rd56, %rd3, %r6; +; CHECK-NEXT: shl.b64 %rd56, %rd5, %r6; ; CHECK-NEXT: mov.b64 %rd50, %rd51; ; CHECK-NEXT: @%p14 bra $L__BB5_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader ; CHECK-NEXT: cvt.u32.u64 %r9, %rd52; -; CHECK-NEXT: shr.u64 %rd26, %rd3, 
%r9; +; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd27, %rd4, %r10; +; CHECK-NEXT: shl.b64 %rd27, %rd6, %r10; ; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd29, %rd4, %r11; +; CHECK-NEXT: shr.u64 %rd29, %rd6, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; ; CHECK-NEXT: selp.b64 %rd54, %rd29, %rd28, %p16; -; CHECK-NEXT: shr.u64 %rd55, %rd4, %r9; -; CHECK-NEXT: add.cc.s64 %rd1, %rd5, -1; -; CHECK-NEXT: addc.cc.s64 %rd2, %rd6, -1; +; CHECK-NEXT: shr.u64 %rd55, %rd6, %r9; +; CHECK-NEXT: add.cc.s64 %rd3, %rd7, -1; +; CHECK-NEXT: addc.cc.s64 %rd4, %rd8, -1; ; CHECK-NEXT: mov.b64 %rd50, 0; ; CHECK-NEXT: mov.b64 %rd51, %rd50; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while @@ -531,12 +531,12 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: shl.b64 %rd39, %rd56, 1; ; CHECK-NEXT: or.b64 %rd56, %rd51, %rd39; ; CHECK-NEXT: or.b64 %rd57, %rd50, %rd38; -; CHECK-NEXT: sub.cc.s64 %rd40, %rd1, %rd35; -; CHECK-NEXT: subc.cc.s64 %rd41, %rd2, %rd32; +; CHECK-NEXT: sub.cc.s64 %rd40, %rd3, %rd35; +; CHECK-NEXT: subc.cc.s64 %rd41, %rd4, %rd32; ; CHECK-NEXT: shr.s64 %rd42, %rd41, 63; ; CHECK-NEXT: and.b64 %rd51, %rd42, 1; -; CHECK-NEXT: and.b64 %rd43, %rd42, %rd5; -; CHECK-NEXT: and.b64 %rd44, %rd42, %rd6; +; CHECK-NEXT: and.b64 %rd43, %rd42, %rd7; +; CHECK-NEXT: and.b64 %rd44, %rd42, %rd8; ; CHECK-NEXT: sub.cc.s64 %rd54, %rd35, %rd43; ; CHECK-NEXT: subc.cc.s64 %rd55, %rd32, %rd44; ; CHECK-NEXT: add.cc.s64 %rd52, %rd52, -1; diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index f11a9c854c465..c3b21b389e783 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -2013,48 +2013,48 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a6, 0(a1) +; RV32I-NEXT: lw a5, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a5 -; RV32I-NEXT: sub t0, t0, a2 -; RV32I-NEXT: sltu a2, a6, a3 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: mv t1, a2 +; RV32I-NEXT: sltu t1, a2, a6 +; RV32I-NEXT: sub a7, t0, a7 +; RV32I-NEXT: sltu t0, a5, a3 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: mv t1, t0 ; RV32I-NEXT: beq a1, a4, .LBB31_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t1, a1, a4 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a6, a2, a6 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sltu a4, a5, t1 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a4, t0, a4 -; RV32I-NEXT: sub a2, a1, a2 -; RV32I-NEXT: sub a1, a6, a3 -; RV32I-NEXT: bgez a4, .LBB31_4 +; RV32I-NEXT: sltu a4, a6, t1 +; RV32I-NEXT: sub a2, a1, t0 +; RV32I-NEXT: sub a1, a7, a4 +; RV32I-NEXT: sub a4, a6, t1 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: bgez a1, .LBB31_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: snez a3, a2 -; RV32I-NEXT: snez a6, a1 -; RV32I-NEXT: neg a7, a5 -; RV32I-NEXT: snez a5, a5 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: add a4, a4, a5 -; RV32I-NEXT: add a2, a2, a6 -; RV32I-NEXT: sltu a6, a7, a3 -; RV32I-NEXT: neg a4, a4 -; RV32I-NEXT: sub a5, a7, a3 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: or a6, a3, a2 +; RV32I-NEXT: snez a4, a4 +; RV32I-NEXT: snez a7, a3 +; RV32I-NEXT: neg a3, a3 +; 
RV32I-NEXT: snez a6, a6 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: add a2, a2, a7 +; RV32I-NEXT: sltu a7, a5, a6 ; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a4, a5, a6 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: .LBB31_4: -; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a5, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128: @@ -2076,48 +2076,48 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a6, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a5 -; RV32ZBB-NEXT: sub t0, t0, a2 -; RV32ZBB-NEXT: sltu a2, a6, a3 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: mv t1, a2 +; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sub a7, t0, a7 +; RV32ZBB-NEXT: sltu t0, a5, a3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: mv t1, t0 ; RV32ZBB-NEXT: beq a1, a4, .LBB31_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: sltu t1, a1, a4 ; RV32ZBB-NEXT: .LBB31_2: -; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a6, a2, a6 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sltu a4, a5, t1 -; RV32ZBB-NEXT: sub a5, a5, t1 -; RV32ZBB-NEXT: sub a4, t0, a4 -; RV32ZBB-NEXT: sub a2, a1, a2 -; RV32ZBB-NEXT: sub a1, a6, a3 -; RV32ZBB-NEXT: bgez a4, .LBB31_4 +; RV32ZBB-NEXT: sltu a4, a6, t1 +; RV32ZBB-NEXT: sub a2, a1, t0 +; RV32ZBB-NEXT: sub a1, a7, a4 +; RV32ZBB-NEXT: sub a4, a6, t1 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: bgez a1, .LBB31_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: snez a3, a2 -; RV32ZBB-NEXT: snez a6, a1 -; RV32ZBB-NEXT: neg a7, a5 -; RV32ZBB-NEXT: snez a5, a5 -; RV32ZBB-NEXT: or a3, a6, a3 -; RV32ZBB-NEXT: add a4, a4, a5 -; RV32ZBB-NEXT: add a2, a2, a6 -; RV32ZBB-NEXT: sltu a6, a7, a3 -; RV32ZBB-NEXT: neg a4, a4 -; RV32ZBB-NEXT: sub a5, a7, a3 -; RV32ZBB-NEXT: neg a2, a2 -; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: or a6, a3, a2 +; RV32ZBB-NEXT: snez a4, a4 +; RV32ZBB-NEXT: snez a7, a3 +; RV32ZBB-NEXT: neg a3, a3 +; RV32ZBB-NEXT: snez a6, a6 +; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: add a2, a2, a7 +; RV32ZBB-NEXT: sltu a7, a5, a6 ; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a4, a5, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: .LBB31_4: -; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 4(a0) -; RV32ZBB-NEXT: sw a5, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128: @@ -2144,48 +2144,48 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a6, 0(a1) +; RV32I-NEXT: lw a5, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a5 -; RV32I-NEXT: sub t0, t0, a2 -; RV32I-NEXT: sltu a2, a6, a3 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: mv t1, a2 +; RV32I-NEXT: 
sltu t1, a2, a6 +; RV32I-NEXT: sub a7, t0, a7 +; RV32I-NEXT: sltu t0, a5, a3 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: mv t1, t0 ; RV32I-NEXT: beq a1, a4, .LBB32_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t1, a1, a4 ; RV32I-NEXT: .LBB32_2: -; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a6, a2, a6 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sltu a4, a5, t1 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a4, t0, a4 -; RV32I-NEXT: sub a2, a1, a2 -; RV32I-NEXT: sub a1, a6, a3 -; RV32I-NEXT: bgez a4, .LBB32_4 +; RV32I-NEXT: sltu a4, a6, t1 +; RV32I-NEXT: sub a2, a1, t0 +; RV32I-NEXT: sub a1, a7, a4 +; RV32I-NEXT: sub a4, a6, t1 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: bgez a1, .LBB32_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: snez a3, a2 -; RV32I-NEXT: snez a6, a1 -; RV32I-NEXT: neg a7, a5 -; RV32I-NEXT: snez a5, a5 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: add a4, a4, a5 -; RV32I-NEXT: add a2, a2, a6 -; RV32I-NEXT: sltu a6, a7, a3 -; RV32I-NEXT: neg a4, a4 -; RV32I-NEXT: sub a5, a7, a3 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: or a6, a3, a2 +; RV32I-NEXT: snez a4, a4 +; RV32I-NEXT: snez a7, a3 +; RV32I-NEXT: neg a3, a3 +; RV32I-NEXT: snez a6, a6 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: add a2, a2, a7 +; RV32I-NEXT: sltu a7, a5, a6 ; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a4, a5, a6 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: .LBB32_4: -; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a5, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128_undef: @@ -2207,48 +2207,48 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a6, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a5 -; RV32ZBB-NEXT: sub t0, t0, a2 -; RV32ZBB-NEXT: sltu a2, a6, a3 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: mv t1, a2 +; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sub a7, t0, a7 +; RV32ZBB-NEXT: sltu t0, a5, a3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: mv t1, t0 ; RV32ZBB-NEXT: beq a1, a4, .LBB32_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: sltu t1, a1, a4 ; RV32ZBB-NEXT: .LBB32_2: -; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a6, a2, a6 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sltu a4, a5, t1 -; RV32ZBB-NEXT: sub a5, a5, t1 -; RV32ZBB-NEXT: sub a4, t0, a4 -; RV32ZBB-NEXT: sub a2, a1, a2 -; RV32ZBB-NEXT: sub a1, a6, a3 -; RV32ZBB-NEXT: bgez a4, .LBB32_4 +; RV32ZBB-NEXT: sltu a4, a6, t1 +; RV32ZBB-NEXT: sub a2, a1, t0 +; RV32ZBB-NEXT: sub a1, a7, a4 +; RV32ZBB-NEXT: sub a4, a6, t1 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: bgez a1, .LBB32_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: snez a3, a2 -; RV32ZBB-NEXT: snez a6, a1 -; RV32ZBB-NEXT: neg a7, a5 -; RV32ZBB-NEXT: snez a5, a5 -; RV32ZBB-NEXT: or a3, a6, a3 -; RV32ZBB-NEXT: add a4, a4, a5 -; RV32ZBB-NEXT: add a2, a2, a6 -; RV32ZBB-NEXT: sltu a6, a7, a3 -; RV32ZBB-NEXT: neg a4, a4 -; RV32ZBB-NEXT: sub a5, a7, a3 -; RV32ZBB-NEXT: neg a2, a2 -; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: or a6, a3, a2 +; 
RV32ZBB-NEXT: snez a4, a4 +; RV32ZBB-NEXT: snez a7, a3 +; RV32ZBB-NEXT: neg a3, a3 +; RV32ZBB-NEXT: snez a6, a6 +; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: add a2, a2, a7 +; RV32ZBB-NEXT: sltu a7, a5, a6 ; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a4, a5, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: .LBB32_4: -; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 4(a0) -; RV32ZBB-NEXT: sw a5, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128_undef: diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index 713b52f53e3d9..405c0b905ca16 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -624,87 +624,88 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a7, 4(a2) -; RV32I-NEXT: lw a3, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a4, 0(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw t1, 8(a2) +; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a7, 0(a1) ; RV32I-NEXT: lw t0, 4(a1) -; RV32I-NEXT: sltu a1, a4, a3 -; RV32I-NEXT: sub t1, a6, t1 -; RV32I-NEXT: sltu t2, a2, a5 -; RV32I-NEXT: sub a1, t1, a1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq t0, a7, .LBB11_2 +; RV32I-NEXT: sltu a1, a3, t1 +; RV32I-NEXT: sub a2, a5, a2 +; RV32I-NEXT: sltu t2, a7, a4 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: mv a2, t2 +; RV32I-NEXT: beq t0, a6, .LBB11_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, t0, a7 +; RV32I-NEXT: sltu a2, t0, a6 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: sltu t3, a3, t1 +; RV32I-NEXT: sub t1, a3, t1 +; RV32I-NEXT: sltu t3, t1, a2 ; RV32I-NEXT: sub a1, a1, t3 -; RV32I-NEXT: sub a3, a3, t1 -; RV32I-NEXT: beq a1, a6, .LBB11_4 +; RV32I-NEXT: sub a2, t1, a2 +; RV32I-NEXT: beq a1, a5, .LBB11_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a6, a1 +; RV32I-NEXT: sltu t1, a5, a1 ; RV32I-NEXT: j .LBB11_5 ; RV32I-NEXT: .LBB11_4: -; RV32I-NEXT: sltu t1, a4, a3 +; RV32I-NEXT: sltu t1, a3, a2 ; RV32I-NEXT: .LBB11_5: -; RV32I-NEXT: sub a7, t0, a7 -; RV32I-NEXT: sub a7, a7, t2 -; RV32I-NEXT: sub a5, a2, a5 -; RV32I-NEXT: beq a7, t0, .LBB11_7 +; RV32I-NEXT: sub a6, t0, a6 +; RV32I-NEXT: sub a6, a6, t2 +; RV32I-NEXT: sub t2, a7, a4 +; RV32I-NEXT: beq a6, t0, .LBB11_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a2, t0, a7 +; RV32I-NEXT: sltu a4, t0, a6 ; RV32I-NEXT: j .LBB11_8 ; RV32I-NEXT: .LBB11_7: -; RV32I-NEXT: sltu a2, a2, a5 +; RV32I-NEXT: sltu a4, a7, t2 ; RV32I-NEXT: .LBB11_8: -; RV32I-NEXT: xor a6, a1, a6 -; RV32I-NEXT: xor a4, a3, a4 -; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: beqz a4, .LBB11_10 +; RV32I-NEXT: xor a5, a1, a5 +; RV32I-NEXT: xor a3, a2, a3 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: beqz a3, .LBB11_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: mv a4, t1 ; RV32I-NEXT: .LBB11_10: -; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: xor t0, a5, a4 -; RV32I-NEXT: xor t3, a7, a4 -; RV32I-NEXT: sltu a5, t0, a4 -; RV32I-NEXT: add a6, t3, a2 -; RV32I-NEXT: add t0, t0, a2 -; RV32I-NEXT: sub t1, a6, a5 -; RV32I-NEXT: snez a6, t1 -; RV32I-NEXT: snez t2, t0 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: beqz a7, 
.LBB11_12 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: xor t0, t2, a5 +; RV32I-NEXT: sltu a7, t0, a5 +; RV32I-NEXT: xor t1, a6, a5 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: beqz a6, .LBB11_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu a5, t3, a4 +; RV32I-NEXT: sltu a3, t1, a5 ; RV32I-NEXT: .LBB11_12: -; RV32I-NEXT: xor a3, a3, a4 -; RV32I-NEXT: xor a1, a1, a4 -; RV32I-NEXT: add t1, t1, t2 -; RV32I-NEXT: neg a7, t0 -; RV32I-NEXT: add t0, a3, a2 -; RV32I-NEXT: sltu a3, a3, a4 +; RV32I-NEXT: xor a2, a2, a5 +; RV32I-NEXT: add t1, t1, a4 +; RV32I-NEXT: add t0, t0, a4 +; RV32I-NEXT: xor a1, a1, a5 +; RV32I-NEXT: add a6, a2, a4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sltu a2, a2, a5 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: snez a4, t0 +; RV32I-NEXT: neg a5, t0 +; RV32I-NEXT: sub t1, a6, a3 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sltu a2, a6, a3 +; RV32I-NEXT: add a4, a7, a4 +; RV32I-NEXT: neg a3, t1 +; RV32I-NEXT: snez a6, t0 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: snez a2, t1 +; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: sltu a7, a3, a6 ; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: neg a2, t1 -; RV32I-NEXT: sub a4, t0, a5 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sltu a3, t0, a5 -; RV32I-NEXT: neg a5, a4 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: snez a3, a4 -; RV32I-NEXT: sltu a4, a5, a6 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: sub a3, a5, a6 +; RV32I-NEXT: sub a2, a3, a6 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sw a7, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sw a5, 0(a0) +; RV32I-NEXT: sw a4, 4(a0) +; RV32I-NEXT: sw a2, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; @@ -736,87 +737,88 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_ext_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a7, 4(a2) -; RV32ZBB-NEXT: lw a3, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw a6, 12(a1) -; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a4, 0(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw t1, 8(a2) +; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw a3, 8(a1) +; RV32ZBB-NEXT: lw a5, 12(a1) +; RV32ZBB-NEXT: lw a7, 0(a1) ; RV32ZBB-NEXT: lw t0, 4(a1) -; RV32ZBB-NEXT: sltu a1, a4, a3 -; RV32ZBB-NEXT: sub t1, a6, t1 -; RV32ZBB-NEXT: sltu t2, a2, a5 -; RV32ZBB-NEXT: sub a1, t1, a1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq t0, a7, .LBB11_2 +; RV32ZBB-NEXT: sltu a1, a3, t1 +; RV32ZBB-NEXT: sub a2, a5, a2 +; RV32ZBB-NEXT: sltu t2, a7, a4 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: mv a2, t2 +; RV32ZBB-NEXT: beq t0, a6, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, t0, a7 +; RV32ZBB-NEXT: sltu a2, t0, a6 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: sltu t3, a3, t1 +; RV32ZBB-NEXT: sub t1, a3, t1 +; RV32ZBB-NEXT: sltu t3, t1, a2 ; RV32ZBB-NEXT: sub a1, a1, t3 -; RV32ZBB-NEXT: sub a3, a3, t1 -; RV32ZBB-NEXT: beq a1, a6, .LBB11_4 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: beq a1, a5, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a6, a1 +; RV32ZBB-NEXT: sltu t1, a5, a1 ; RV32ZBB-NEXT: j .LBB11_5 ; RV32ZBB-NEXT: .LBB11_4: -; RV32ZBB-NEXT: sltu t1, a4, a3 +; RV32ZBB-NEXT: sltu t1, a3, a2 ; RV32ZBB-NEXT: .LBB11_5: -; RV32ZBB-NEXT: sub a7, t0, a7 -; RV32ZBB-NEXT: sub a7, a7, t2 -; RV32ZBB-NEXT: sub a5, a2, a5 -; RV32ZBB-NEXT: beq a7, t0, .LBB11_7 +; RV32ZBB-NEXT: sub a6, t0, a6 +; RV32ZBB-NEXT: sub a6, 
a6, t2 +; RV32ZBB-NEXT: sub t2, a7, a4 +; RV32ZBB-NEXT: beq a6, t0, .LBB11_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a2, t0, a7 +; RV32ZBB-NEXT: sltu a4, t0, a6 ; RV32ZBB-NEXT: j .LBB11_8 ; RV32ZBB-NEXT: .LBB11_7: -; RV32ZBB-NEXT: sltu a2, a2, a5 +; RV32ZBB-NEXT: sltu a4, a7, t2 ; RV32ZBB-NEXT: .LBB11_8: -; RV32ZBB-NEXT: xor a6, a1, a6 -; RV32ZBB-NEXT: xor a4, a3, a4 -; RV32ZBB-NEXT: or a4, a4, a6 -; RV32ZBB-NEXT: beqz a4, .LBB11_10 +; RV32ZBB-NEXT: xor a5, a1, a5 +; RV32ZBB-NEXT: xor a3, a2, a3 +; RV32ZBB-NEXT: or a3, a3, a5 +; RV32ZBB-NEXT: beqz a3, .LBB11_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a2, t1 +; RV32ZBB-NEXT: mv a4, t1 ; RV32ZBB-NEXT: .LBB11_10: -; RV32ZBB-NEXT: neg a4, a2 -; RV32ZBB-NEXT: xor t0, a5, a4 -; RV32ZBB-NEXT: xor t3, a7, a4 -; RV32ZBB-NEXT: sltu a5, t0, a4 -; RV32ZBB-NEXT: add a6, t3, a2 -; RV32ZBB-NEXT: add t0, t0, a2 -; RV32ZBB-NEXT: sub t1, a6, a5 -; RV32ZBB-NEXT: snez a6, t1 -; RV32ZBB-NEXT: snez t2, t0 -; RV32ZBB-NEXT: or a6, t2, a6 -; RV32ZBB-NEXT: beqz a7, .LBB11_12 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: xor t0, t2, a5 +; RV32ZBB-NEXT: sltu a7, t0, a5 +; RV32ZBB-NEXT: xor t1, a6, a5 +; RV32ZBB-NEXT: mv a3, a7 +; RV32ZBB-NEXT: beqz a6, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a5, t3, a4 +; RV32ZBB-NEXT: sltu a3, t1, a5 ; RV32ZBB-NEXT: .LBB11_12: -; RV32ZBB-NEXT: xor a3, a3, a4 -; RV32ZBB-NEXT: xor a1, a1, a4 -; RV32ZBB-NEXT: add t1, t1, t2 -; RV32ZBB-NEXT: neg a7, t0 -; RV32ZBB-NEXT: add t0, a3, a2 -; RV32ZBB-NEXT: sltu a3, a3, a4 +; RV32ZBB-NEXT: xor a2, a2, a5 +; RV32ZBB-NEXT: add t1, t1, a4 +; RV32ZBB-NEXT: add t0, t0, a4 +; RV32ZBB-NEXT: xor a1, a1, a5 +; RV32ZBB-NEXT: add a6, a2, a4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sltu a2, a2, a5 +; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: snez a4, t0 +; RV32ZBB-NEXT: neg a5, t0 +; RV32ZBB-NEXT: sub t1, a6, a3 +; RV32ZBB-NEXT: or t0, t0, a7 +; RV32ZBB-NEXT: sub a1, a1, a2 +; RV32ZBB-NEXT: sltu a2, a6, a3 +; RV32ZBB-NEXT: add a4, a7, a4 +; RV32ZBB-NEXT: neg a3, t1 +; RV32ZBB-NEXT: snez a6, t0 +; RV32ZBB-NEXT: sub a1, a1, a2 +; RV32ZBB-NEXT: snez a2, t1 +; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: sltu a7, a3, a6 ; RV32ZBB-NEXT: add a1, a1, a2 -; RV32ZBB-NEXT: neg a2, t1 -; RV32ZBB-NEXT: sub a4, t0, a5 -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sltu a3, t0, a5 -; RV32ZBB-NEXT: neg a5, a4 -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: snez a3, a4 -; RV32ZBB-NEXT: sltu a4, a5, a6 -; RV32ZBB-NEXT: add a1, a1, a3 -; RV32ZBB-NEXT: sub a3, a5, a6 +; RV32ZBB-NEXT: sub a2, a3, a6 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sw a7, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) -; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: sw a5, 0(a0) +; RV32ZBB-NEXT: sw a4, 4(a0) +; RV32ZBB-NEXT: sw a2, 8(a0) ; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; @@ -857,87 +859,88 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128_undef: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a7, 4(a2) -; RV32I-NEXT: lw a3, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a4, 0(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw t1, 8(a2) +; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a7, 0(a1) ; RV32I-NEXT: lw t0, 4(a1) -; RV32I-NEXT: sltu a1, a4, a3 -; RV32I-NEXT: sub t1, a6, t1 -; 
RV32I-NEXT: sltu t2, a2, a5 -; RV32I-NEXT: sub a1, t1, a1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq t0, a7, .LBB12_2 +; RV32I-NEXT: sltu a1, a3, t1 +; RV32I-NEXT: sub a2, a5, a2 +; RV32I-NEXT: sltu t2, a7, a4 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: mv a2, t2 +; RV32I-NEXT: beq t0, a6, .LBB12_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, t0, a7 +; RV32I-NEXT: sltu a2, t0, a6 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: sltu t3, a3, t1 +; RV32I-NEXT: sub t1, a3, t1 +; RV32I-NEXT: sltu t3, t1, a2 ; RV32I-NEXT: sub a1, a1, t3 -; RV32I-NEXT: sub a3, a3, t1 -; RV32I-NEXT: beq a1, a6, .LBB12_4 +; RV32I-NEXT: sub a2, t1, a2 +; RV32I-NEXT: beq a1, a5, .LBB12_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a6, a1 +; RV32I-NEXT: sltu t1, a5, a1 ; RV32I-NEXT: j .LBB12_5 ; RV32I-NEXT: .LBB12_4: -; RV32I-NEXT: sltu t1, a4, a3 +; RV32I-NEXT: sltu t1, a3, a2 ; RV32I-NEXT: .LBB12_5: -; RV32I-NEXT: sub a7, t0, a7 -; RV32I-NEXT: sub a7, a7, t2 -; RV32I-NEXT: sub a5, a2, a5 -; RV32I-NEXT: beq a7, t0, .LBB12_7 +; RV32I-NEXT: sub a6, t0, a6 +; RV32I-NEXT: sub a6, a6, t2 +; RV32I-NEXT: sub t2, a7, a4 +; RV32I-NEXT: beq a6, t0, .LBB12_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a2, t0, a7 +; RV32I-NEXT: sltu a4, t0, a6 ; RV32I-NEXT: j .LBB12_8 ; RV32I-NEXT: .LBB12_7: -; RV32I-NEXT: sltu a2, a2, a5 +; RV32I-NEXT: sltu a4, a7, t2 ; RV32I-NEXT: .LBB12_8: -; RV32I-NEXT: xor a6, a1, a6 -; RV32I-NEXT: xor a4, a3, a4 -; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: beqz a4, .LBB12_10 +; RV32I-NEXT: xor a5, a1, a5 +; RV32I-NEXT: xor a3, a2, a3 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: beqz a3, .LBB12_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: mv a4, t1 ; RV32I-NEXT: .LBB12_10: -; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: xor t0, a5, a4 -; RV32I-NEXT: xor t3, a7, a4 -; RV32I-NEXT: sltu a5, t0, a4 -; RV32I-NEXT: add a6, t3, a2 -; RV32I-NEXT: add t0, t0, a2 -; RV32I-NEXT: sub t1, a6, a5 -; RV32I-NEXT: snez a6, t1 -; RV32I-NEXT: snez t2, t0 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: beqz a7, .LBB12_12 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: xor t0, t2, a5 +; RV32I-NEXT: sltu a7, t0, a5 +; RV32I-NEXT: xor t1, a6, a5 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: beqz a6, .LBB12_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu a5, t3, a4 +; RV32I-NEXT: sltu a3, t1, a5 ; RV32I-NEXT: .LBB12_12: -; RV32I-NEXT: xor a3, a3, a4 -; RV32I-NEXT: xor a1, a1, a4 -; RV32I-NEXT: add t1, t1, t2 -; RV32I-NEXT: neg a7, t0 -; RV32I-NEXT: add t0, a3, a2 -; RV32I-NEXT: sltu a3, a3, a4 +; RV32I-NEXT: xor a2, a2, a5 +; RV32I-NEXT: add t1, t1, a4 +; RV32I-NEXT: add t0, t0, a4 +; RV32I-NEXT: xor a1, a1, a5 +; RV32I-NEXT: add a6, a2, a4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sltu a2, a2, a5 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: snez a4, t0 +; RV32I-NEXT: neg a5, t0 +; RV32I-NEXT: sub t1, a6, a3 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sltu a2, a6, a3 +; RV32I-NEXT: add a4, a7, a4 +; RV32I-NEXT: neg a3, t1 +; RV32I-NEXT: snez a6, t0 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: snez a2, t1 +; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: sltu a7, a3, a6 ; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: neg a2, t1 -; RV32I-NEXT: sub a4, t0, a5 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sltu a3, t0, a5 -; RV32I-NEXT: neg a5, a4 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: snez a3, a4 -; RV32I-NEXT: sltu a4, a5, a6 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: sub a3, a5, a6 +; RV32I-NEXT: sub a2, a3, a6 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sw 
a7, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sw a5, 0(a0) +; RV32I-NEXT: sw a4, 4(a0) +; RV32I-NEXT: sw a2, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; @@ -969,87 +972,88 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_ext_i128_undef: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a7, 4(a2) -; RV32ZBB-NEXT: lw a3, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw a6, 12(a1) -; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a4, 0(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw t1, 8(a2) +; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw a3, 8(a1) +; RV32ZBB-NEXT: lw a5, 12(a1) +; RV32ZBB-NEXT: lw a7, 0(a1) ; RV32ZBB-NEXT: lw t0, 4(a1) -; RV32ZBB-NEXT: sltu a1, a4, a3 -; RV32ZBB-NEXT: sub t1, a6, t1 -; RV32ZBB-NEXT: sltu t2, a2, a5 -; RV32ZBB-NEXT: sub a1, t1, a1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq t0, a7, .LBB12_2 +; RV32ZBB-NEXT: sltu a1, a3, t1 +; RV32ZBB-NEXT: sub a2, a5, a2 +; RV32ZBB-NEXT: sltu t2, a7, a4 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: mv a2, t2 +; RV32ZBB-NEXT: beq t0, a6, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, t0, a7 +; RV32ZBB-NEXT: sltu a2, t0, a6 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: sltu t3, a3, t1 +; RV32ZBB-NEXT: sub t1, a3, t1 +; RV32ZBB-NEXT: sltu t3, t1, a2 ; RV32ZBB-NEXT: sub a1, a1, t3 -; RV32ZBB-NEXT: sub a3, a3, t1 -; RV32ZBB-NEXT: beq a1, a6, .LBB12_4 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: beq a1, a5, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a6, a1 +; RV32ZBB-NEXT: sltu t1, a5, a1 ; RV32ZBB-NEXT: j .LBB12_5 ; RV32ZBB-NEXT: .LBB12_4: -; RV32ZBB-NEXT: sltu t1, a4, a3 +; RV32ZBB-NEXT: sltu t1, a3, a2 ; RV32ZBB-NEXT: .LBB12_5: -; RV32ZBB-NEXT: sub a7, t0, a7 -; RV32ZBB-NEXT: sub a7, a7, t2 -; RV32ZBB-NEXT: sub a5, a2, a5 -; RV32ZBB-NEXT: beq a7, t0, .LBB12_7 +; RV32ZBB-NEXT: sub a6, t0, a6 +; RV32ZBB-NEXT: sub a6, a6, t2 +; RV32ZBB-NEXT: sub t2, a7, a4 +; RV32ZBB-NEXT: beq a6, t0, .LBB12_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a2, t0, a7 +; RV32ZBB-NEXT: sltu a4, t0, a6 ; RV32ZBB-NEXT: j .LBB12_8 ; RV32ZBB-NEXT: .LBB12_7: -; RV32ZBB-NEXT: sltu a2, a2, a5 +; RV32ZBB-NEXT: sltu a4, a7, t2 ; RV32ZBB-NEXT: .LBB12_8: -; RV32ZBB-NEXT: xor a6, a1, a6 -; RV32ZBB-NEXT: xor a4, a3, a4 -; RV32ZBB-NEXT: or a4, a4, a6 -; RV32ZBB-NEXT: beqz a4, .LBB12_10 +; RV32ZBB-NEXT: xor a5, a1, a5 +; RV32ZBB-NEXT: xor a3, a2, a3 +; RV32ZBB-NEXT: or a3, a3, a5 +; RV32ZBB-NEXT: beqz a3, .LBB12_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a2, t1 +; RV32ZBB-NEXT: mv a4, t1 ; RV32ZBB-NEXT: .LBB12_10: -; RV32ZBB-NEXT: neg a4, a2 -; RV32ZBB-NEXT: xor t0, a5, a4 -; RV32ZBB-NEXT: xor t3, a7, a4 -; RV32ZBB-NEXT: sltu a5, t0, a4 -; RV32ZBB-NEXT: add a6, t3, a2 -; RV32ZBB-NEXT: add t0, t0, a2 -; RV32ZBB-NEXT: sub t1, a6, a5 -; RV32ZBB-NEXT: snez a6, t1 -; RV32ZBB-NEXT: snez t2, t0 -; RV32ZBB-NEXT: or a6, t2, a6 -; RV32ZBB-NEXT: beqz a7, .LBB12_12 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: xor t0, t2, a5 +; RV32ZBB-NEXT: sltu a7, t0, a5 +; RV32ZBB-NEXT: xor t1, a6, a5 +; RV32ZBB-NEXT: mv a3, a7 +; RV32ZBB-NEXT: beqz a6, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a5, t3, a4 +; RV32ZBB-NEXT: sltu a3, t1, a5 ; RV32ZBB-NEXT: .LBB12_12: -; RV32ZBB-NEXT: xor a3, a3, a4 -; RV32ZBB-NEXT: xor a1, a1, a4 -; RV32ZBB-NEXT: add t1, t1, t2 -; RV32ZBB-NEXT: neg a7, t0 -; RV32ZBB-NEXT: add t0, a3, a2 -; 
RV32ZBB-NEXT: sltu a3, a3, a4 +; RV32ZBB-NEXT: xor a2, a2, a5 +; RV32ZBB-NEXT: add t1, t1, a4 +; RV32ZBB-NEXT: add t0, t0, a4 +; RV32ZBB-NEXT: xor a1, a1, a5 +; RV32ZBB-NEXT: add a6, a2, a4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sltu a2, a2, a5 +; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: snez a4, t0 +; RV32ZBB-NEXT: neg a5, t0 +; RV32ZBB-NEXT: sub t1, a6, a3 +; RV32ZBB-NEXT: or t0, t0, a7 +; RV32ZBB-NEXT: sub a1, a1, a2 +; RV32ZBB-NEXT: sltu a2, a6, a3 +; RV32ZBB-NEXT: add a4, a7, a4 +; RV32ZBB-NEXT: neg a3, t1 +; RV32ZBB-NEXT: snez a6, t0 +; RV32ZBB-NEXT: sub a1, a1, a2 +; RV32ZBB-NEXT: snez a2, t1 +; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: sltu a7, a3, a6 ; RV32ZBB-NEXT: add a1, a1, a2 -; RV32ZBB-NEXT: neg a2, t1 -; RV32ZBB-NEXT: sub a4, t0, a5 -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sltu a3, t0, a5 -; RV32ZBB-NEXT: neg a5, a4 -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: snez a3, a4 -; RV32ZBB-NEXT: sltu a4, a5, a6 -; RV32ZBB-NEXT: add a1, a1, a3 -; RV32ZBB-NEXT: sub a3, a5, a6 +; RV32ZBB-NEXT: sub a2, a3, a6 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sw a7, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) -; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: sw a5, 0(a0) +; RV32ZBB-NEXT: sw a4, 4(a0) +; RV32ZBB-NEXT: sw a2, 8(a0) ; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index 18d071cc39bb6..17721dc7241a7 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -1091,21 +1091,21 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti -; RV32IF-NEXT: lw a3, 8(sp) +; RV32IF-NEXT: lw a2, 8(sp) ; RV32IF-NEXT: lw a1, 12(sp) -; RV32IF-NEXT: lw a2, 16(sp) +; RV32IF-NEXT: lw a0, 16(sp) ; RV32IF-NEXT: lw a4, 20(sp) -; RV32IF-NEXT: lui a0, 524288 -; RV32IF-NEXT: addi a5, a0, -1 +; RV32IF-NEXT: lui a3, 524288 +; RV32IF-NEXT: addi a5, a3, -1 ; RV32IF-NEXT: beq a1, a5, .LBB18_2 ; RV32IF-NEXT: # %bb.1: # %entry ; RV32IF-NEXT: sltu a6, a1, a5 -; RV32IF-NEXT: or a7, a2, a4 +; RV32IF-NEXT: or a7, a0, a4 ; RV32IF-NEXT: bnez a7, .LBB18_3 ; RV32IF-NEXT: j .LBB18_4 ; RV32IF-NEXT: .LBB18_2: -; RV32IF-NEXT: sltiu a6, a3, -1 -; RV32IF-NEXT: or a7, a2, a4 +; RV32IF-NEXT: sltiu a6, a2, -1 +; RV32IF-NEXT: or a7, a0, a4 ; RV32IF-NEXT: beqz a7, .LBB18_4 ; RV32IF-NEXT: .LBB18_3: # %entry ; RV32IF-NEXT: srli a6, a4, 31 @@ -1116,19 +1116,19 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: # %bb.5: # %entry ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB18_6: # %entry -; RV32IF-NEXT: or a3, t0, a3 +; RV32IF-NEXT: or a2, t0, a2 ; RV32IF-NEXT: and a4, a7, a4 -; RV32IF-NEXT: and a2, a7, a2 -; RV32IF-NEXT: beq a1, a0, .LBB18_8 +; RV32IF-NEXT: and a5, a7, a0 +; RV32IF-NEXT: beq a1, a3, .LBB18_8 ; RV32IF-NEXT: # %bb.7: # %entry -; RV32IF-NEXT: sltu a0, a0, a1 +; RV32IF-NEXT: sltu a0, a3, a1 ; RV32IF-NEXT: j .LBB18_9 ; RV32IF-NEXT: .LBB18_8: -; RV32IF-NEXT: snez a0, a3 +; RV32IF-NEXT: snez a0, a2 ; RV32IF-NEXT: .LBB18_9: # %entry -; RV32IF-NEXT: and a2, a2, a4 -; RV32IF-NEXT: li a5, -1 -; RV32IF-NEXT: beq a2, a5, .LBB18_11 +; RV32IF-NEXT: and a5, a5, a4 +; RV32IF-NEXT: li a3, -1 +; RV32IF-NEXT: beq a5, a3, .LBB18_11 ; RV32IF-NEXT: # %bb.10: # %entry ; RV32IF-NEXT: srli a4, a4, 31 ; RV32IF-NEXT: xori a0, a4, 1 @@ -1138,7 +1138,7 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: lui a1, 524288 ; RV32IF-NEXT: .LBB18_13: # %entry 
; RV32IF-NEXT: neg a0, a0 -; RV32IF-NEXT: and a0, a0, a3 +; RV32IF-NEXT: and a0, a0, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -1194,21 +1194,21 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti -; RV32IFD-NEXT: lw a3, 8(sp) +; RV32IFD-NEXT: lw a2, 8(sp) ; RV32IFD-NEXT: lw a1, 12(sp) -; RV32IFD-NEXT: lw a2, 16(sp) +; RV32IFD-NEXT: lw a0, 16(sp) ; RV32IFD-NEXT: lw a4, 20(sp) -; RV32IFD-NEXT: lui a0, 524288 -; RV32IFD-NEXT: addi a5, a0, -1 +; RV32IFD-NEXT: lui a3, 524288 +; RV32IFD-NEXT: addi a5, a3, -1 ; RV32IFD-NEXT: beq a1, a5, .LBB18_2 ; RV32IFD-NEXT: # %bb.1: # %entry ; RV32IFD-NEXT: sltu a6, a1, a5 -; RV32IFD-NEXT: or a7, a2, a4 +; RV32IFD-NEXT: or a7, a0, a4 ; RV32IFD-NEXT: bnez a7, .LBB18_3 ; RV32IFD-NEXT: j .LBB18_4 ; RV32IFD-NEXT: .LBB18_2: -; RV32IFD-NEXT: sltiu a6, a3, -1 -; RV32IFD-NEXT: or a7, a2, a4 +; RV32IFD-NEXT: sltiu a6, a2, -1 +; RV32IFD-NEXT: or a7, a0, a4 ; RV32IFD-NEXT: beqz a7, .LBB18_4 ; RV32IFD-NEXT: .LBB18_3: # %entry ; RV32IFD-NEXT: srli a6, a4, 31 @@ -1219,19 +1219,19 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: # %bb.5: # %entry ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB18_6: # %entry -; RV32IFD-NEXT: or a3, t0, a3 +; RV32IFD-NEXT: or a2, t0, a2 ; RV32IFD-NEXT: and a4, a7, a4 -; RV32IFD-NEXT: and a2, a7, a2 -; RV32IFD-NEXT: beq a1, a0, .LBB18_8 +; RV32IFD-NEXT: and a5, a7, a0 +; RV32IFD-NEXT: beq a1, a3, .LBB18_8 ; RV32IFD-NEXT: # %bb.7: # %entry -; RV32IFD-NEXT: sltu a0, a0, a1 +; RV32IFD-NEXT: sltu a0, a3, a1 ; RV32IFD-NEXT: j .LBB18_9 ; RV32IFD-NEXT: .LBB18_8: -; RV32IFD-NEXT: snez a0, a3 +; RV32IFD-NEXT: snez a0, a2 ; RV32IFD-NEXT: .LBB18_9: # %entry -; RV32IFD-NEXT: and a2, a2, a4 -; RV32IFD-NEXT: li a5, -1 -; RV32IFD-NEXT: beq a2, a5, .LBB18_11 +; RV32IFD-NEXT: and a5, a5, a4 +; RV32IFD-NEXT: li a3, -1 +; RV32IFD-NEXT: beq a5, a3, .LBB18_11 ; RV32IFD-NEXT: # %bb.10: # %entry ; RV32IFD-NEXT: srli a4, a4, 31 ; RV32IFD-NEXT: xori a0, a4, 1 @@ -1241,7 +1241,7 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: lui a1, 524288 ; RV32IFD-NEXT: .LBB18_13: # %entry ; RV32IFD-NEXT: neg a0, a0 -; RV32IFD-NEXT: and a0, a0, a3 +; RV32IFD-NEXT: and a0, a0, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -1378,8 +1378,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: # %bb.4: # %entry ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: .LBB20_5: # %entry -; RV32IF-NEXT: lw a4, 8(sp) -; RV32IF-NEXT: lw a3, 12(sp) +; RV32IF-NEXT: lw a3, 8(sp) +; RV32IF-NEXT: lw a4, 12(sp) ; RV32IF-NEXT: and a5, a2, a1 ; RV32IF-NEXT: beqz a5, .LBB20_7 ; RV32IF-NEXT: # %bb.6: # %entry @@ -1393,12 +1393,12 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: bnez a0, .LBB20_10 ; RV32IF-NEXT: # %bb.9: -; RV32IF-NEXT: or a0, a4, a2 +; RV32IF-NEXT: or a0, a2, a4 ; RV32IF-NEXT: snez a1, a0 ; RV32IF-NEXT: .LBB20_10: # %entry ; RV32IF-NEXT: neg a1, a1 -; RV32IF-NEXT: and a0, a1, a4 -; RV32IF-NEXT: and a1, a1, a2 +; RV32IF-NEXT: and a0, a1, a2 +; RV32IF-NEXT: and a1, a1, a4 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -1461,8 +1461,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: # %bb.4: # %entry ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: .LBB20_5: # %entry -; RV32IFD-NEXT: lw a4, 8(sp) -; RV32IFD-NEXT: lw a3, 12(sp) +; 
RV32IFD-NEXT: lw a3, 8(sp) +; RV32IFD-NEXT: lw a4, 12(sp) ; RV32IFD-NEXT: and a5, a2, a1 ; RV32IFD-NEXT: beqz a5, .LBB20_7 ; RV32IFD-NEXT: # %bb.6: # %entry @@ -1476,12 +1476,12 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: and a2, a2, a3 ; RV32IFD-NEXT: bnez a0, .LBB20_10 ; RV32IFD-NEXT: # %bb.9: -; RV32IFD-NEXT: or a0, a4, a2 +; RV32IFD-NEXT: or a0, a2, a4 ; RV32IFD-NEXT: snez a1, a0 ; RV32IFD-NEXT: .LBB20_10: # %entry ; RV32IFD-NEXT: neg a1, a1 -; RV32IFD-NEXT: and a0, a1, a4 -; RV32IFD-NEXT: and a1, a1, a2 +; RV32IFD-NEXT: and a0, a1, a2 +; RV32IFD-NEXT: and a1, a1, a4 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -1506,21 +1506,21 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a3, 8(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a4, 20(sp) -; RV32-NEXT: lui a0, 524288 -; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: lui a3, 524288 +; RV32-NEXT: addi a5, a3, -1 ; RV32-NEXT: beq a1, a5, .LBB21_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: bnez a7, .LBB21_3 ; RV32-NEXT: j .LBB21_4 ; RV32-NEXT: .LBB21_2: -; RV32-NEXT: sltiu a6, a3, -1 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: beqz a7, .LBB21_4 ; RV32-NEXT: .LBB21_3: # %entry ; RV32-NEXT: srli a6, a4, 31 @@ -1531,19 +1531,19 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB21_6: # %entry -; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: or a2, t0, a2 ; RV32-NEXT: and a4, a7, a4 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a0, .LBB21_8 +; RV32-NEXT: and a5, a7, a0 +; RV32-NEXT: beq a1, a3, .LBB21_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: sltu a0, a3, a1 ; RV32-NEXT: j .LBB21_9 ; RV32-NEXT: .LBB21_8: -; RV32-NEXT: snez a0, a3 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB21_9: # %entry -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: li a5, -1 -; RV32-NEXT: beq a2, a5, .LBB21_11 +; RV32-NEXT: and a5, a5, a4 +; RV32-NEXT: li a3, -1 +; RV32-NEXT: beq a5, a3, .LBB21_11 ; RV32-NEXT: # %bb.10: # %entry ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: xori a0, a4, 1 @@ -1553,7 +1553,7 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB21_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -1658,8 +1658,8 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB23_5: # %entry -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: lw a3, 8(sp) +; RV32-NEXT: lw a4, 12(sp) ; RV32-NEXT: and a5, a2, a1 ; RV32-NEXT: beqz a5, .LBB23_7 ; RV32-NEXT: # %bb.6: # %entry @@ -1673,12 +1673,12 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB23_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a4, a2 +; RV32-NEXT: or a0, a2, a4 ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB23_10: # %entry ; RV32-NEXT: neg a1, a1 -; RV32-NEXT: and a0, a1, a4 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: and a0, a1, a2 +; RV32-NEXT: and a1, a1, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore 
ra ; RV32-NEXT: addi sp, sp, 32 @@ -1733,21 +1733,21 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a3, 8(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a4, 20(sp) -; RV32-NEXT: lui a0, 524288 -; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: lui a3, 524288 +; RV32-NEXT: addi a5, a3, -1 ; RV32-NEXT: beq a1, a5, .LBB24_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: bnez a7, .LBB24_3 ; RV32-NEXT: j .LBB24_4 ; RV32-NEXT: .LBB24_2: -; RV32-NEXT: sltiu a6, a3, -1 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: beqz a7, .LBB24_4 ; RV32-NEXT: .LBB24_3: # %entry ; RV32-NEXT: srli a6, a4, 31 @@ -1758,19 +1758,19 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB24_6: # %entry -; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: or a2, t0, a2 ; RV32-NEXT: and a4, a7, a4 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a0, .LBB24_8 +; RV32-NEXT: and a5, a7, a0 +; RV32-NEXT: beq a1, a3, .LBB24_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: sltu a0, a3, a1 ; RV32-NEXT: j .LBB24_9 ; RV32-NEXT: .LBB24_8: -; RV32-NEXT: snez a0, a3 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB24_9: # %entry -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: li a5, -1 -; RV32-NEXT: beq a2, a5, .LBB24_11 +; RV32-NEXT: and a5, a5, a4 +; RV32-NEXT: li a3, -1 +; RV32-NEXT: beq a5, a3, .LBB24_11 ; RV32-NEXT: # %bb.10: # %entry ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: xori a0, a4, 1 @@ -1780,7 +1780,7 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB24_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -1921,8 +1921,8 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB26_5: # %entry -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: lw a3, 8(sp) +; RV32-NEXT: lw a4, 12(sp) ; RV32-NEXT: and a5, a2, a1 ; RV32-NEXT: beqz a5, .LBB26_7 ; RV32-NEXT: # %bb.6: # %entry @@ -1936,12 +1936,12 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB26_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a4, a2 +; RV32-NEXT: or a0, a2, a4 ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB26_10: # %entry ; RV32-NEXT: neg a1, a1 -; RV32-NEXT: and a0, a1, a4 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: and a0, a1, a2 +; RV32-NEXT: and a1, a1, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3027,21 +3027,21 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti -; RV32IF-NEXT: lw a3, 8(sp) +; RV32IF-NEXT: lw a2, 8(sp) ; RV32IF-NEXT: lw a1, 12(sp) -; RV32IF-NEXT: lw a2, 16(sp) +; RV32IF-NEXT: lw a0, 16(sp) ; RV32IF-NEXT: lw a4, 20(sp) -; RV32IF-NEXT: lui a0, 524288 -; RV32IF-NEXT: addi a5, a0, -1 +; RV32IF-NEXT: lui a3, 524288 +; RV32IF-NEXT: addi a5, a3, -1 ; RV32IF-NEXT: beq a1, a5, .LBB45_2 ; RV32IF-NEXT: # %bb.1: # %entry ; RV32IF-NEXT: sltu a6, a1, a5 -; RV32IF-NEXT: or a7, a2, a4 +; RV32IF-NEXT: or a7, a0, a4 ; RV32IF-NEXT: bnez a7, .LBB45_3 ; 
RV32IF-NEXT: j .LBB45_4 ; RV32IF-NEXT: .LBB45_2: -; RV32IF-NEXT: sltiu a6, a3, -1 -; RV32IF-NEXT: or a7, a2, a4 +; RV32IF-NEXT: sltiu a6, a2, -1 +; RV32IF-NEXT: or a7, a0, a4 ; RV32IF-NEXT: beqz a7, .LBB45_4 ; RV32IF-NEXT: .LBB45_3: # %entry ; RV32IF-NEXT: srli a6, a4, 31 @@ -3052,19 +3052,19 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: # %bb.5: # %entry ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB45_6: # %entry -; RV32IF-NEXT: or a3, t0, a3 +; RV32IF-NEXT: or a2, t0, a2 ; RV32IF-NEXT: and a4, a7, a4 -; RV32IF-NEXT: and a2, a7, a2 -; RV32IF-NEXT: beq a1, a0, .LBB45_8 +; RV32IF-NEXT: and a5, a7, a0 +; RV32IF-NEXT: beq a1, a3, .LBB45_8 ; RV32IF-NEXT: # %bb.7: # %entry -; RV32IF-NEXT: sltu a0, a0, a1 +; RV32IF-NEXT: sltu a0, a3, a1 ; RV32IF-NEXT: j .LBB45_9 ; RV32IF-NEXT: .LBB45_8: -; RV32IF-NEXT: snez a0, a3 +; RV32IF-NEXT: snez a0, a2 ; RV32IF-NEXT: .LBB45_9: # %entry -; RV32IF-NEXT: and a2, a2, a4 -; RV32IF-NEXT: li a5, -1 -; RV32IF-NEXT: beq a2, a5, .LBB45_11 +; RV32IF-NEXT: and a5, a5, a4 +; RV32IF-NEXT: li a3, -1 +; RV32IF-NEXT: beq a5, a3, .LBB45_11 ; RV32IF-NEXT: # %bb.10: # %entry ; RV32IF-NEXT: srli a4, a4, 31 ; RV32IF-NEXT: xori a0, a4, 1 @@ -3074,7 +3074,7 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: lui a1, 524288 ; RV32IF-NEXT: .LBB45_13: # %entry ; RV32IF-NEXT: neg a0, a0 -; RV32IF-NEXT: and a0, a0, a3 +; RV32IF-NEXT: and a0, a0, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -3130,21 +3130,21 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti -; RV32IFD-NEXT: lw a3, 8(sp) +; RV32IFD-NEXT: lw a2, 8(sp) ; RV32IFD-NEXT: lw a1, 12(sp) -; RV32IFD-NEXT: lw a2, 16(sp) +; RV32IFD-NEXT: lw a0, 16(sp) ; RV32IFD-NEXT: lw a4, 20(sp) -; RV32IFD-NEXT: lui a0, 524288 -; RV32IFD-NEXT: addi a5, a0, -1 +; RV32IFD-NEXT: lui a3, 524288 +; RV32IFD-NEXT: addi a5, a3, -1 ; RV32IFD-NEXT: beq a1, a5, .LBB45_2 ; RV32IFD-NEXT: # %bb.1: # %entry ; RV32IFD-NEXT: sltu a6, a1, a5 -; RV32IFD-NEXT: or a7, a2, a4 +; RV32IFD-NEXT: or a7, a0, a4 ; RV32IFD-NEXT: bnez a7, .LBB45_3 ; RV32IFD-NEXT: j .LBB45_4 ; RV32IFD-NEXT: .LBB45_2: -; RV32IFD-NEXT: sltiu a6, a3, -1 -; RV32IFD-NEXT: or a7, a2, a4 +; RV32IFD-NEXT: sltiu a6, a2, -1 +; RV32IFD-NEXT: or a7, a0, a4 ; RV32IFD-NEXT: beqz a7, .LBB45_4 ; RV32IFD-NEXT: .LBB45_3: # %entry ; RV32IFD-NEXT: srli a6, a4, 31 @@ -3155,19 +3155,19 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: # %bb.5: # %entry ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB45_6: # %entry -; RV32IFD-NEXT: or a3, t0, a3 +; RV32IFD-NEXT: or a2, t0, a2 ; RV32IFD-NEXT: and a4, a7, a4 -; RV32IFD-NEXT: and a2, a7, a2 -; RV32IFD-NEXT: beq a1, a0, .LBB45_8 +; RV32IFD-NEXT: and a5, a7, a0 +; RV32IFD-NEXT: beq a1, a3, .LBB45_8 ; RV32IFD-NEXT: # %bb.7: # %entry -; RV32IFD-NEXT: sltu a0, a0, a1 +; RV32IFD-NEXT: sltu a0, a3, a1 ; RV32IFD-NEXT: j .LBB45_9 ; RV32IFD-NEXT: .LBB45_8: -; RV32IFD-NEXT: snez a0, a3 +; RV32IFD-NEXT: snez a0, a2 ; RV32IFD-NEXT: .LBB45_9: # %entry -; RV32IFD-NEXT: and a2, a2, a4 -; RV32IFD-NEXT: li a5, -1 -; RV32IFD-NEXT: beq a2, a5, .LBB45_11 +; RV32IFD-NEXT: and a5, a5, a4 +; RV32IFD-NEXT: li a3, -1 +; RV32IFD-NEXT: beq a5, a3, .LBB45_11 ; RV32IFD-NEXT: # %bb.10: # %entry ; RV32IFD-NEXT: srli a4, a4, 31 ; RV32IFD-NEXT: xori a0, a4, 1 @@ -3177,7 +3177,7 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: lui a1, 524288 ; RV32IFD-NEXT: .LBB45_13: # %entry ; RV32IFD-NEXT: neg a0, 
a0 -; RV32IFD-NEXT: and a0, a0, a3 +; RV32IFD-NEXT: and a0, a0, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -3400,21 +3400,21 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a3, 8(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a4, 20(sp) -; RV32-NEXT: lui a0, 524288 -; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: lui a3, 524288 +; RV32-NEXT: addi a5, a3, -1 ; RV32-NEXT: beq a1, a5, .LBB48_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: bnez a7, .LBB48_3 ; RV32-NEXT: j .LBB48_4 ; RV32-NEXT: .LBB48_2: -; RV32-NEXT: sltiu a6, a3, -1 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: beqz a7, .LBB48_4 ; RV32-NEXT: .LBB48_3: # %entry ; RV32-NEXT: srli a6, a4, 31 @@ -3425,19 +3425,19 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB48_6: # %entry -; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: or a2, t0, a2 ; RV32-NEXT: and a4, a7, a4 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a0, .LBB48_8 +; RV32-NEXT: and a5, a7, a0 +; RV32-NEXT: beq a1, a3, .LBB48_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: sltu a0, a3, a1 ; RV32-NEXT: j .LBB48_9 ; RV32-NEXT: .LBB48_8: -; RV32-NEXT: snez a0, a3 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB48_9: # %entry -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: li a5, -1 -; RV32-NEXT: beq a2, a5, .LBB48_11 +; RV32-NEXT: and a5, a5, a4 +; RV32-NEXT: li a3, -1 +; RV32-NEXT: beq a5, a3, .LBB48_11 ; RV32-NEXT: # %bb.10: # %entry ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: xori a0, a4, 1 @@ -3447,7 +3447,7 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB48_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3601,21 +3601,21 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a3, 8(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a4, 20(sp) -; RV32-NEXT: lui a0, 524288 -; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: lui a3, 524288 +; RV32-NEXT: addi a5, a3, -1 ; RV32-NEXT: beq a1, a5, .LBB51_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: bnez a7, .LBB51_3 ; RV32-NEXT: j .LBB51_4 ; RV32-NEXT: .LBB51_2: -; RV32-NEXT: sltiu a6, a3, -1 -; RV32-NEXT: or a7, a2, a4 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a0, a4 ; RV32-NEXT: beqz a7, .LBB51_4 ; RV32-NEXT: .LBB51_3: # %entry ; RV32-NEXT: srli a6, a4, 31 @@ -3626,19 +3626,19 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB51_6: # %entry -; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: or a2, t0, a2 ; RV32-NEXT: and a4, a7, a4 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a0, .LBB51_8 +; RV32-NEXT: and a5, a7, a0 +; RV32-NEXT: beq a1, a3, .LBB51_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: sltu a0, a3, a1 ; 
RV32-NEXT: j .LBB51_9 ; RV32-NEXT: .LBB51_8: -; RV32-NEXT: snez a0, a3 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB51_9: # %entry -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: li a5, -1 -; RV32-NEXT: beq a2, a5, .LBB51_11 +; RV32-NEXT: and a5, a5, a4 +; RV32-NEXT: li a3, -1 +; RV32-NEXT: beq a5, a3, .LBB51_11 ; RV32-NEXT: # %bb.10: # %entry ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: xori a0, a4, 1 @@ -3648,7 +3648,7 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB51_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll index c157c63722cb4..a570e261db6a6 100644 --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -308,11 +308,11 @@ define i128 @abs128(i128 %x) { ; RV32I-NEXT: bgez a2, .LBB8_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: neg a5, a1 -; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: snez a7, a3 +; RV32I-NEXT: or a6, a3, a4 ; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: snez a7, a3 ; RV32I-NEXT: neg a4, a4 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: snez a6, a6 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: sub a4, a4, a7 ; RV32I-NEXT: sltu a2, a5, a6 @@ -336,11 +336,11 @@ define i128 @abs128(i128 %x) { ; RV32ZBB-NEXT: bgez a2, .LBB8_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: neg a5, a1 -; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: snez a7, a3 +; RV32ZBB-NEXT: or a6, a3, a4 ; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: snez a7, a3 ; RV32ZBB-NEXT: neg a4, a4 -; RV32ZBB-NEXT: or a6, a7, a6 +; RV32ZBB-NEXT: snez a6, a6 ; RV32ZBB-NEXT: add a1, a2, a1 ; RV32ZBB-NEXT: sub a4, a4, a7 ; RV32ZBB-NEXT: sltu a2, a5, a6 @@ -390,11 +390,11 @@ define i128 @select_abs128(i128 %x) { ; RV32I-NEXT: bgez a2, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: neg a5, a1 -; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: snez a7, a3 +; RV32I-NEXT: or a6, a3, a4 ; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: snez a7, a3 ; RV32I-NEXT: neg a4, a4 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: snez a6, a6 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: sub a4, a4, a7 ; RV32I-NEXT: sltu a2, a5, a6 @@ -418,11 +418,11 @@ define i128 @select_abs128(i128 %x) { ; RV32ZBB-NEXT: bgez a2, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: neg a5, a1 -; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: snez a7, a3 +; RV32ZBB-NEXT: or a6, a3, a4 ; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: snez a7, a3 ; RV32ZBB-NEXT: neg a4, a4 -; RV32ZBB-NEXT: or a6, a7, a6 +; RV32ZBB-NEXT: snez a6, a6 ; RV32ZBB-NEXT: add a1, a2, a1 ; RV32ZBB-NEXT: sub a4, a4, a7 ; RV32ZBB-NEXT: sltu a2, a5, a6 diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index a06c7505d543d..87c8343a417cd 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -7,18 +7,18 @@ define i32 @ctz_nxv4i32( %a) #0 { ; RV32-LABEL: ctz_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; RV32-NEXT: vid.v v10 -; RV32-NEXT: li a1, -1 +; RV32-NEXT: vmv.v.i v11, -1 +; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vmsne.vi v0, v8, 0 ; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vmadd.vx v10, a1, 
v8 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: vmacc.vv v8, v10, v11 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sub a0, a0, a1 @@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32( %a) #0 { ; ; RV64-LABEL: ctz_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; RV64-NEXT: vid.v v10 -; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmv.v.i v11, -1 +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmadd.vx v10, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: vmacc.vv v8, v10, v11 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sub a0, a0, a1 @@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range( %a) { ; ; RV64-LABEL: ctz_nxv8i1_no_range: ; RV64: # %bb.0: -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64-NEXT: vid.v v16 -; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmv.v.i v24, -1 +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmadd.vx v16, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: vmacc.vv v8, v16, v24 +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sub a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index 24853ebafefcc..47ed3e5133c48 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -270,17 +270,19 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: srliw a0, a0, 1 ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: addi a1, a2, 1365 -; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: srli a2, a0, 2 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: srli a2, a0, 4 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: slli a2, a0, 33 +; RV64I-NEXT: srli a2, a2, 41 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: slli a2, a0, 33 +; RV64I-NEXT: srli a2, a2, 49 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: srli a2, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index d133f9d1db389..8f73663eec059 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -231,17 +231,19 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: srliw a0, a0, 1 ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: addi a1, a2, 1365 -; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: srli a2, a0, 2 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: srliw a2, a0, 4 +; 
RV64I-NEXT: srli a2, a0, 4 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: slli a2, a0, 33 +; RV64I-NEXT: srli a2, a2, 41 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: slli a2, a0, 33 +; RV64I-NEXT: srli a2, a2, 49 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: srli a2, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 32892bca84747..a84f2a32208e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -1380,38 +1380,38 @@ define @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @icmp_eq_vv_nxv32i32( %va, @icmp_eq_vx_nxv32i32( %va, i32 %b, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 -; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: slli a4, a3, 1 ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sub a4, a1, a3 -; CHECK-NEXT: sltu a5, a1, a4 -; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: and a4, a5, a4 -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; CHECK-NEXT: sub a5, a1, a4 +; CHECK-NEXT: sltu a6, a1, a5 +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: and a5, a6, a5 +; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t -; CHECK-NEXT: bltu a1, a3, .LBB190_2 +; CHECK-NEXT: bltu a1, a4, .LBB190_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: mv a1, a4 ; CHECK-NEXT: .LBB190_2: ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: srli a3, a3, 1 +; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2299,22 +2301,23 @@ define @icmp_eq_vx_swap_nxv32i32( %va, i32 ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 -; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: slli a4, a3, 1 ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sub a4, a1, a3 -; CHECK-NEXT: sltu a5, a1, a4 -; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: and a4, a5, a4 -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; CHECK-NEXT: sub a5, a1, a4 +; CHECK-NEXT: sltu a6, a1, a5 +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: and a5, a6, a5 +; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t -; CHECK-NEXT: bltu a1, a3, .LBB191_2 +; CHECK-NEXT: bltu a1, a4, .LBB191_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: mv a1, a4 ; CHECK-NEXT: .LBB191_2: ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: srli a3, a3, 1 +; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index b6aa4affbb10f..419c2cfb04b26 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -719,38 +719,38 @@ define @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 %stride, i32 zeroext %evl) { ; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: sub a2, a3, a4 -; CHECK-RV32-NEXT: sltu a5, a3, a2 +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: sub a4, a3, a2 +; CHECK-RV32-NEXT: sltu a5, a3, a4 ; CHECK-RV32-NEXT: addi a5, a5, -1 -; CHECK-RV32-NEXT: and a2, a5, a2 -; CHECK-RV32-NEXT: bltu a3, a4, .LBB56_2 +; CHECK-RV32-NEXT: and a4, a5, a4 +; CHECK-RV32-NEXT: bltu a3, a2, .LBB56_2 ; CHECK-RV32-NEXT: # %bb.1: -; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: mv a3, a2 ; CHECK-RV32-NEXT: .LBB56_2: -; CHECK-RV32-NEXT: mul a4, a3, a1 -; CHECK-RV32-NEXT: add a4, a0, a4 -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV32-NEXT: mul a2, a3, a1 +; CHECK-RV32-NEXT: add a2, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-RV32-NEXT: vlse64.v v16, (a2), a1 ; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: strided_load_nxv16f64_allones_mask: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: sub a3, a2, a4 -; CHECK-RV64-NEXT: sltu a5, a2, a3 +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: sub a4, a2, a3 +; CHECK-RV64-NEXT: sltu a5, a2, a4 ; CHECK-RV64-NEXT: addi a5, a5, -1 -; CHECK-RV64-NEXT: and a3, a5, a3 -; CHECK-RV64-NEXT: bltu a2, a4, .LBB56_2 +; CHECK-RV64-NEXT: and a4, a5, a4 +; CHECK-RV64-NEXT: bltu a2, a3, .LBB56_2 ; CHECK-RV64-NEXT: # %bb.1: -; CHECK-RV64-NEXT: mv a2, a4 +; CHECK-RV64-NEXT: mv a2, a3 ; CHECK-RV64-NEXT: .LBB56_2: -; CHECK-RV64-NEXT: mul a4, a2, a1 -; CHECK-RV64-NEXT: add a4, a0, a4 -; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-RV64-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV64-NEXT: mul a3, a2, a1 +; CHECK-RV64-NEXT: add a3, a0, a3 +; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-RV64-NEXT: vlse64.v v16, (a3), a1 ; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 ; CHECK-RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll index 946c0bbd7ff6f..ea001e273013b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1415,22 +1415,24 @@ define @vadd_vi_nxv32i32_evl_nx8( %va, @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: fmv.x.h a1, fa0 @@ -530,10 +530,7 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; 
CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t @@ -547,24 +544,17 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -611,13 +601,13 @@ define @vfadd_vf_nxv32bf16_unmasked( @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -1300,10 +1290,7 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t @@ -1317,24 +1304,17 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB50_2: ; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli 
zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -1387,13 +1367,13 @@ define @vfadd_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB51_2: +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll index 736d575a1a4e3..e5401681bf5f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll @@ -476,9 +476,9 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: fmv.x.h a1, fa0 @@ -492,10 +492,7 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t @@ -509,24 +506,17 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB22_2: ; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add 
sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -573,13 +563,13 @@ define @vfdiv_vf_nxv32bf16_unmasked( @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -1212,10 +1202,7 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t @@ -1229,24 +1216,17 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB46_2: ; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -1299,13 +1279,13 @@ define @vfdiv_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB47_2: +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index 0bfa68298f6b5..77c6b2b87b829 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -1001,24 +1001,20 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -1026,19 +1022,18 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v12, v4 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t @@ -1233,22 +1228,22 @@ define @vfma_vf_nxv32bf16_unmasked_commute( @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli 
a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -2499,19 +2490,18 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v v12, v4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t @@ -2718,22 +2708,22 @@ define @vfma_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -8482,19 +8468,18 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v 
v12, v4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t @@ -8637,9 +8622,75 @@ define @vfmsub_vf_nxv32f16_unmasked_commute( poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %vb, %va, %negvc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfnmadd_vv_nxv32f16( %va, %b, %c, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t +; ZVFH-NEXT: vmv.v.v v8, v16 +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, %m, i32 %evl) + ret %v +} + +define @vfnmadd_vv_nxv32f16_commuted( %va, %b, %c, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, %m, i32 %evl) + ret %v +} + +define @vfnmadd_vv_nxv32f16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfnmadd_vv_nxv32f16_unmasked_commuted( %va, %b, %c, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfnmadd_vf_nxv32f16( %va, half %b, %vc, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_unmasked_commute: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 @@ -8647,27 +8698,28 @@ define @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer - %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %vb, %va, %negvc, splat (i1 true), i32 %evl) - ret %v -} - -define @vfnmadd_vv_nxv32f16( %va, %b, %c, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, 
v0.t -; ZVFH-NEXT: vmv.v.v v8, v16 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t -; ZVFHMIN-NEXT: slli a0, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: csrr a5, vlenb -; ZVFHMIN-NEXT: slli a5, a5, 4 -; ZVFHMIN-NEXT: add a5, sp, a5 -; ZVFHMIN-NEXT: addi a5, a5, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: vmv1r.v v0, v6 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t -; ZVFHMIN-NEXT: vmv.v.v v4, v12 -; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB286_2: -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 
64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: vmv4r.v v12, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, %m, i32 %evl) - ret %v -} - -define @vfnmadd_vv_nxv32f16_commuted( %va, %b, %c, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: slli a0, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: vmv1r.v v0, v6 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, 
m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t -; ZVFHMIN-NEXT: bltu a1, a0, .LBB287_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB287_2: -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vmv.v.v v16, v8 -; ZVFHMIN-NEXT: vmv4r.v v12, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, %m, i32 %evl) - ret %v -} - -define @vfnmadd_vv_nxv32f16_unmasked( %va, %b, %c, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, 
v16, v24 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 -; ZVFHMIN-NEXT: slli a0, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v24, a2 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t -; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB288_2: -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; 
ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, splat (i1 true), i32 %evl) - ret %v -} - -define @vfnmadd_vv_nxv32f16_unmasked_commuted( %va, %b, %c, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 -; ZVFHMIN-NEXT: slli a0, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v24, a2 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, 
m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t -; ZVFHMIN-NEXT: bltu a1, a0, .LBB289_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB289_2: -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, splat (i1 true), i32 %evl) - ret %v -} - -define @vfnmadd_vf_nxv32f16( %va, half %b, %vc, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a4, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: slli a1, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a2, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a2 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a2, a3, a2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, 
a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t -; ZVFHMIN-NEXT: bltu a0, a1, .LBB290_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB290_2: -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t -; ZVFHMIN-NEXT: vmv.v.v v16, v8 -; ZVFHMIN-NEXT: vmv4r.v v12, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %elt.head = insertelement poison, half %b, i32 0 - %vb = shufflevector %elt.head, poison, zeroinitializer - %negva = call @llvm.vp.fneg.nxv32f16( %va, %m, i32 %evl) - %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( 
%negva, %vb, %negvc, %m, i32 %evl) - ret %v -} - -define @vfnmadd_vf_nxv32f16_commute( %va, half %b, %vc, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a4, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: slli a1, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a2, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a2 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a2, a3, a2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t -; ZVFHMIN-NEXT: vmv.v.v v4, v12 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB291_2: -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi 
a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t -; ZVFHMIN-NEXT: vmv.v.v v16, v8 -; ZVFHMIN-NEXT: vmv4r.v v12, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %elt.head = insertelement poison, half %b, i32 0 - %vb = shufflevector %elt.head, poison, zeroinitializer - %negva = call @llvm.vp.fneg.nxv32f16( %va, %m, i32 %evl) - %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %vb, %negva, %negvc, %m, i32 %evl) - ret %v -} - -define @vfnmadd_vf_nxv32f16_unmasked( %va, half %b, %vc, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 -; ZVFHMIN-NEXT: slli a1, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a0, a1 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a4 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli 
a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t -; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB292_2: -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %elt.head = insertelement poison, half %b, i32 0 - %vb = shufflevector %elt.head, poison, zeroinitializer - %negva = call @llvm.vp.fneg.nxv32f16( %va, splat (i1 true), i32 %evl) - %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negva, %vb, %negvc, splat (i1 true), i32 %evl) - ret %v -} - -define @vfnmadd_vf_nxv32f16_unmasked_commute( %va, half %b, %vc, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * 
vlenb -; ZVFHMIN-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 -; ZVFHMIN-NEXT: slli a1, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a0, a1 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a4 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t -; ZVFHMIN-NEXT: bltu a0, a1, .LBB293_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB293_2: -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 -; ZVFHMIN-NEXT: vsetvli zero, 
zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %elt.head = insertelement poison, half %b, i32 0 - %vb = shufflevector %elt.head, poison, zeroinitializer - %negva = call @llvm.vp.fneg.nxv32f16( %va, splat (i1 true), i32 %evl) - %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %vb, %negva, %negvc, splat (i1 true), i32 %evl) + %negva = call @llvm.vp.fneg.nxv32f16( %va, %m, i32 %evl) + %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negva, %vb, %negvc, %m, i32 %evl) ret %v } -define @vfnmadd_vf_nxv32f16_neg_splat( %va, half %b, %vc, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat: +define @vfnmadd_vf_nxv32f16_commute( %va, half %b, %vc, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 @@ -9779,24 +8830,25 @@ define @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: mv a4, a1 ; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, a1, a4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t ; ZVFHMIN-NEXT: sub a2, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 @@ -9834,10 +8886,10 @@ define @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t ; ZVFHMIN-NEXT: vmv.v.v v4, v12 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB294_2: +; ZVFHMIN-NEXT: .LBB291_2: ; ZVFHMIN-NEXT: vmv1r.v v0, v3 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 @@ -9862,152 +8914,21 @@ define @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t 
-; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t -; ZVFHMIN-NEXT: vmv.v.v v16, v8 -; ZVFHMIN-NEXT: vmv4r.v v12, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 -; ZVFHMIN-NEXT: ret - %elt.head = insertelement poison, half %b, i32 0 - %vb = shufflevector %elt.head, poison, zeroinitializer - %negvb = call @llvm.vp.fneg.nxv32f16( %vb, %m, i32 %evl) - %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negvb, %negvc, %m, i32 %evl) - ret %v -} - -define @vfnmadd_vf_nxv32f16_neg_splat_commute( %va, half %b, %vc, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: slli a1, a3, 1 -; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: sub a2, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a0, a2 -; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a2, a3, a2 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; 
ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t -; ZVFHMIN-NEXT: bltu a0, a1, .LBB295_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB295_2: -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t ; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v v12, v4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -10021,20 +8942,20 @@ define @vfnmadd_vf_nxv32f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer - %negvb = call @llvm.vp.fneg.nxv32f16( %vb, %m, i32 %evl) + %negva = call @llvm.vp.fneg.nxv32f16( %va, %m, i32 %evl) %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negvb, %va, %negvc, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %vb, %negva, %negvc, %m, i32 %evl) ret %v } -define @vfnmadd_vf_nxv32f16_neg_splat_unmasked( %va, half %b, %vc, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked: +define @vfnmadd_vf_nxv32f16_unmasked( %va, half %b, %vc, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: 
vfnmadd.vf v8, fa0, v16 ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 @@ -10042,80 +8963,74 @@ define @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer - %negvb = call @llvm.vp.fneg.nxv32f16( %vb, splat (i1 true), i32 %evl) + %negva = call @llvm.vp.fneg.nxv32f16( %va, splat (i1 true), i32 %evl) %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negvb, %negvc, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negva, %vb, %negvc, splat (i1 true), i32 %evl) ret %v } -define @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( %va, half %b, %vc, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute: +define @vfnmadd_vf_nxv32f16_unmasked_commute( %va, half %b, %vc, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16 ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 @@ -10161,80 +9082,82 @@ define @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer - %negvb = call @llvm.vp.fneg.nxv32f16( %vb, splat (i1 true), i32 %evl) + %negva = call @llvm.vp.fneg.nxv32f16( %va, splat (i1 true), i32 %evl) %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negvb, %va, %negvc, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %vb, %negva, %negvc, splat (i1 true), i32 %evl) ret %v } -define @vfnmsub_vv_nxv32f16( %va, %b, %c, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmsub_vv_nxv32f16: +define @vfnmadd_vf_nxv32f16_neg_splat( %va, half %b, %vc, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t -; ZVFH-NEXT: vmv.v.v v8, v16 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) 
# vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t -; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: csrr a5, vlenb -; ZVFHMIN-NEXT: slli a5, a5, 4 -; ZVFHMIN-NEXT: add a5, sp, a5 -; ZVFHMIN-NEXT: addi a5, a5, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: vmv1r.v v0, v6 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: and a2, a3, a2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 @@ -10340,31 +9249,40 @@ define @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16( %va, @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, %m, i32 %evl) + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %negvb = call @llvm.vp.fneg.nxv32f16( %vb, %m, i32 %evl) + %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negvb, %negvc, %m, i32 %evl) ret %v } -define @vfnmsub_vv_nxv32f16_commuted( %va, %b, %c, %m, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted: +define @vfnmadd_vf_nxv32f16_neg_splat_commute( %va, half %b, %vc, %m, i32 zeroext %evl) { +; ZVFH-LABEL: 
vfnmadd_vf_nxv32f16_neg_splat_commute: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_commuted: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t -; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 -; ZVFHMIN-NEXT: vmv1r.v v0, v6 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: and a2, a3, a2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv 
a3, a2 @@ -10471,38 +9381,38 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t -; ZVFHMIN-NEXT: bltu a1, a0, .LBB299_2 +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t +; ZVFHMIN-NEXT: bltu a0, a1, .LBB295_2 ; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB299_2: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB295_2: ; ZVFHMIN-NEXT: vmv1r.v v0, v3 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 @@ -10511,15 +9421,15 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t ; ZVFHMIN-NEXT: vmv.v.v v16, v8 @@ -10533,68 +9443,69 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) - %negc = call 
@llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, %m, i32 %evl) + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %negvb = call @llvm.vp.fneg.nxv32f16( %vb, %m, i32 %evl) + %negvc = call @llvm.vp.fneg.nxv32f16( %vc, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negvb, %va, %negvc, %m, i32 %evl) ret %v } -define @vfnmsub_vv_nxv32f16_unmasked( %va, %b, %c, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked: +define @vfnmadd_vf_nxv32f16_neg_splat_unmasked( %va, half %b, %vc, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16 ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 -; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v24, a2 +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: and a2, a3, a2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: 
vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 @@ -10610,16 +9521,16 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t -; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB296_2 ; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB300_2: -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB296_2: +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill @@ -10628,7 +9539,7 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -10636,14 +9547,14 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -10651,68 +9562,69 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret - %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) - %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) - %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, splat (i1 true), i32 %evl) + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %negvb = call 
@llvm.vp.fneg.nxv32f16( %vb, splat (i1 true), i32 %evl) + %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negvb, %negvc, splat (i1 true), i32 %evl) ret %v } -define @vfnmsub_vv_nxv32f16_unmasked_commuted( %va, %b, %c, i32 zeroext %evl) { -; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted: +define @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( %va, half %b, %vc, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vl8re16.v v24, (a0) -; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16 ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted: +; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 -; ZVFHMIN-NEXT: sub sp, sp, a2 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: lui a2, 8 -; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 -; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: sub a4, a1, a0 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 -; ZVFHMIN-NEXT: sltu a3, a1, a4 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v24, a2 +; ZVFHMIN-NEXT: sub a2, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 -; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: and a2, a3, a2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a3, a2 @@ -10720,33 +9632,33 @@ define @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %negvb = call @llvm.vp.fneg.nxv32f16( %vb, splat (i1 true), i32 %evl) + %negvc = call @llvm.vp.fneg.nxv32f16( %vc, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negvb, %va, %negvc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfnmsub_vv_nxv32f16( %va, %b, %c, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmsub_vv_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t +; ZVFH-NEXT: vmv.v.v v8, v16 +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, %m, i32 %evl) + ret %v +} + +define @vfnmsub_vv_nxv32f16_commuted( %va, %b, %c, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, %m, i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, %m, i32 %evl) + ret %v +} + +define @vfnmsub_vv_nxv32f16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: ret + %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) + %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) + %v = call @llvm.vp.fma.nxv32f16( %va, %negb, %negc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfnmsub_vv_nxv32f16_unmasked_commuted( %va, %b, %c, i32 zeroext %evl) { +; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vl8re16.v v24, (a0) +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH-NEXT: vfnmadd.vv v8, v16, v24 +; ZVFH-NEXT: ret %negb = call @llvm.vp.fneg.nxv32f16( %b, splat (i1 true), i32 %evl) %negc = call @llvm.vp.fneg.nxv32f16( %c, splat (i1 true), i32 %evl) %v = call @llvm.vp.fma.nxv32f16( %negb, %va, %negc, splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll index 5c8e499d2f5e1..3df26e5a99585 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll @@ -610,9 +610,9 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb 
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -626,10 +626,7 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t @@ -643,24 +640,17 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB22_2: ; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -713,13 +703,13 @@ define @vfmul_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll index dc0bfbd0f76dd..4fe0bb64e4ef1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll @@ -476,9 +476,9 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp 
+ 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: fmv.x.h a1, fa0 @@ -492,10 +492,7 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t @@ -509,24 +506,17 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB22_2: ; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -573,13 +563,13 @@ define @vfsub_vf_nxv32bf16_unmasked( @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -1212,10 +1202,7 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: addi a2, a2, -1 ; ZVFHMIN-NEXT: and a2, a2, a3 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t @@ -1229,24 +1216,17 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB46_2: ; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: 
addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -1299,13 +1279,13 @@ define @vfsub_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB47_2: +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index d81936354f6f3..77fd2e8f88de5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -1048,22 +1048,24 @@ define @vmax_vx_nxv32i32_evl_nx8( %va, i3 ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv1r.v v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a3, a1, 2 -; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a3, a2, 2 +; RV64-NEXT: slli a1, a2, 1 +; RV64-NEXT: slli a2, a2, 32 ; RV64-NEXT: vslidedown.vx v0, v0, a3 -; RV64-NEXT: sub a3, a1, a2 -; RV64-NEXT: sltu a4, a1, a3 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: sltu a4, a2, a3 ; RV64-NEXT: addi a4, a4, -1 ; RV64-NEXT: and a3, a4, a3 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vmax.vx v16, v16, a0, v0.t -; RV64-NEXT: bltu a1, a2, .LBB82_2 +; RV64-NEXT: bltu a2, a1, .LBB82_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a1, a2 +; RV64-NEXT: mv a2, a1 ; RV64-NEXT: .LBB82_2: ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vmax.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 7603bcef1973e..0700700c02e25 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -1047,22 +1047,24 @@ define @vmaxu_vx_nxv32i32_evl_nx8( %va, i ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv1r.v v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a3, a1, 2 -; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a3, a2, 2 +; RV64-NEXT: slli a1, a2, 1 +; RV64-NEXT: slli a2, a2, 32 ; RV64-NEXT: vslidedown.vx v0, v0, a3 -; RV64-NEXT: sub a3, a1, a2 -; RV64-NEXT: sltu a4, a1, a3 +; 
RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: sltu a4, a2, a3 ; RV64-NEXT: addi a4, a4, -1 ; RV64-NEXT: and a3, a4, a3 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vmaxu.vx v16, v16, a0, v0.t -; RV64-NEXT: bltu a1, a2, .LBB82_2 +; RV64-NEXT: bltu a2, a1, .LBB82_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a1, a2 +; RV64-NEXT: mv a2, a1 ; RV64-NEXT: .LBB82_2: ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vmaxu.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 3922b09f1f02d..c7ad8744fe2c6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -1048,22 +1048,24 @@ define @vmin_vx_nxv32i32_evl_nx8( %va, i3 ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv1r.v v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a3, a1, 2 -; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a3, a2, 2 +; RV64-NEXT: slli a1, a2, 1 +; RV64-NEXT: slli a2, a2, 32 ; RV64-NEXT: vslidedown.vx v0, v0, a3 -; RV64-NEXT: sub a3, a1, a2 -; RV64-NEXT: sltu a4, a1, a3 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: sltu a4, a2, a3 ; RV64-NEXT: addi a4, a4, -1 ; RV64-NEXT: and a3, a4, a3 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vmin.vx v16, v16, a0, v0.t -; RV64-NEXT: bltu a1, a2, .LBB82_2 +; RV64-NEXT: bltu a2, a1, .LBB82_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a1, a2 +; RV64-NEXT: mv a2, a1 ; RV64-NEXT: .LBB82_2: ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vmin.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 59af953fd52d3..388606e39b277 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -1047,22 +1047,24 @@ define @vminu_vx_nxv32i32_evl_nx8( %va, i ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv1r.v v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a3, a1, 2 -; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a3, a2, 2 +; RV64-NEXT: slli a1, a2, 1 +; RV64-NEXT: slli a2, a2, 32 ; RV64-NEXT: vslidedown.vx v0, v0, a3 -; RV64-NEXT: sub a3, a1, a2 -; RV64-NEXT: sltu a4, a1, a3 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: sltu a4, a2, a3 ; RV64-NEXT: addi a4, a4, -1 ; RV64-NEXT: and a3, a4, a3 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vminu.vx v16, v16, a0, v0.t -; RV64-NEXT: bltu a1, a2, .LBB82_2 +; RV64-NEXT: bltu a2, a1, .LBB82_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a1, a2 +; RV64-NEXT: mv a2, a1 ; RV64-NEXT: .LBB82_2: ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vminu.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll index b83ddce61f44d..921697566cb3d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll @@ -470,61 +470,61 @@ define @test_vp_splice_nxv16i64( %va, 
@test_vp_splice_nxv16i64( %va, @test_vp_splice_nxv16i64_negative_offset( %va, %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 { ; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a1, a4, 3 -; CHECK-NEXT: slli a7, a4, 1 -; CHECK-NEXT: addi a7, a7, -1 -; CHECK-NEXT: add a5, a0, a1 -; CHECK-NEXT: mv a6, a2 -; CHECK-NEXT: bltu a2, a7, .LBB23_2 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a4, a5, 1 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: slli a1, a5, 3 +; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv t0, a2 +; CHECK-NEXT: bltu a2, a4, .LBB23_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a6, a7 +; CHECK-NEXT: mv t0, a4 ; CHECK-NEXT: .LBB23_2: ; CHECK-NEXT: addi sp, sp, -80 ; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; CHECK-NEXT: addi s0, sp, 80 -; CHECK-NEXT: csrr a7, vlenb -; CHECK-NEXT: slli a7, a7, 5 -; CHECK-NEXT: sub sp, sp, a7 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: sub sp, sp, a4 ; CHECK-NEXT: andi sp, sp, -64 -; CHECK-NEXT: vl8re64.v v24, (a5) -; CHECK-NEXT: slli a5, a6, 3 -; CHECK-NEXT: addi a7, sp, 64 -; CHECK-NEXT: add a6, a7, a5 -; CHECK-NEXT: mv t0, a2 -; CHECK-NEXT: bltu a2, a4, .LBB23_4 +; CHECK-NEXT: add a6, a0, a1 +; CHECK-NEXT: slli a4, t0, 3 +; CHECK-NEXT: addi t0, sp, 64 +; CHECK-NEXT: bltu a7, a5, .LBB23_4 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: mv t0, a4 +; CHECK-NEXT: mv a7, a5 ; CHECK-NEXT: .LBB23_4: +; CHECK-NEXT: vl8re64.v v24, (a6) +; CHECK-NEXT: add a6, t0, a4 ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma -; CHECK-NEXT: vse64.v v8, (a7) -; CHECK-NEXT: sub a0, a2, a4 -; CHECK-NEXT: add a7, a7, a1 -; CHECK-NEXT: sub t0, a3, a4 +; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (t0) +; CHECK-NEXT: sub a0, a2, a5 ; CHECK-NEXT: sltu a2, a2, a0 ; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a0 -; CHECK-NEXT: sltu a0, a3, t0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, t0 -; CHECK-NEXT: add t0, a6, a1 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vse64.v v16, (a7) +; CHECK-NEXT: and a0, a2, a0 +; CHECK-NEXT: add t0, t0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vse64.v v24, (t0) -; CHECK-NEXT: bltu a3, a4, .LBB23_6 +; CHECK-NEXT: vse64.v v16, (t0) +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: bltu a3, a5, .LBB23_6 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: mv a3, a4 +; CHECK-NEXT: mv a0, a5 ; CHECK-NEXT: .LBB23_6: -; CHECK-NEXT: li a2, 8 -; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v0, (a6) -; CHECK-NEXT: bltu a5, a2, .LBB23_8 +; CHECK-NEXT: sub a2, a3, a5 +; CHECK-NEXT: add a5, a6, a1 +; CHECK-NEXT: sltu a3, a3, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: li a3, 8 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v24, (a5) +; CHECK-NEXT: bltu a4, a3, .LBB23_8 ; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: li a5, 8 +; CHECK-NEXT: li a4, 8 ; CHECK-NEXT: .LBB23_8: -; CHECK-NEXT: sub a2, a6, a5 +; CHECK-NEXT: sub a2, a6, a4 ; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: addi sp, s0, -80 ; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll index 2ed3c9bfe2c16..ea8e57efaf6ca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll @@ -431,28 +431,30 @@ define @select_evl_nxv32i32( %a, @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: kmovw %edi, %k4 @@ -364,11 +364,11 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 0de308a9e0738..9be816655072c 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: not_avg_v16i8_wide_constants: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm2 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm5 @@ -1762,9 +1762,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm9 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax @@ -1774,6 +1771,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm13 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax @@ 
-1783,45 +1783,43 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm15 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE2-NEXT: movapd %xmm4, %xmm5 ; SSE2-NEXT: andpd %xmm1, %xmm5 ; SSE2-NEXT: xorpd %xmm4, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: paddw %xmm5, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: andpd %xmm0, %xmm3 -; SSE2-NEXT: xorpd %xmm2, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; SSE2-NEXT: movapd %xmm0, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: xorpd %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: paddw %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; @@ -1831,75 +1829,71 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpextrw $7, %xmm3, %edx -; AVX1-NEXT: vpextrw $6, %xmm3, %ecx -; AVX1-NEXT: vpextrw $5, %xmm3, %eax +; AVX1-NEXT: vpextrw $3, %xmm3, %edx +; AVX1-NEXT: vpextrw $2, %xmm3, %ecx +; AVX1-NEXT: vpextrw $1, %xmm3, %eax ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vmovd %edx, %xmm4 -; AVX1-NEXT: vpextrw $4, %xmm3, %edx +; AVX1-NEXT: vpextrw $0, %xmm3, %edx ; AVX1-NEXT: decl %ecx ; AVX1-NEXT: vmovd %ecx, %xmm5 -; AVX1-NEXT: vpextrw $1, %xmm3, %ecx +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx ; AVX1-NEXT: decl %eax ; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vpextrw $0, %xmm3, %eax +; AVX1-NEXT: vpextrw $2, %xmm2, %eax ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vmovd %edx, %xmm7 -; AVX1-NEXT: vpextrw $3, %xmm3, %edx -; AVX1-NEXT: decq %rcx -; AVX1-NEXT: vmovq %rcx, %xmm8 -; AVX1-NEXT: vpextrw $2, %xmm3, %ecx -; AVX1-NEXT: decq %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vpextrw $7, %xmm2, %eax +; AVX1-NEXT: vpextrw $1, %xmm2, %edx +; AVX1-NEXT: decl %ecx +; AVX1-NEXT: vmovd %ecx, %xmm8 +; AVX1-NEXT: vpextrw $0, %xmm2, %ecx +; AVX1-NEXT: decl %eax +; AVX1-NEXT: vmovd %eax, %xmm9 +; AVX1-NEXT: vpextrw $7, %xmm3, %eax ; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm9 -; AVX1-NEXT: vpextrw $6, %xmm2, %edx +; AVX1-NEXT: vmovd %edx, %xmm10 +; AVX1-NEXT: vpextrw $6, %xmm3, %edx ; AVX1-NEXT: decl %ecx -; AVX1-NEXT: vmovd %ecx, %xmm10 -; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm11 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx ; AVX1-NEXT: decl %eax -; AVX1-NEXT: vmovd %eax, %xmm11 -; AVX1-NEXT: vpextrw $4, %xmm2, %eax +; AVX1-NEXT: vmovd %eax, %xmm12 +; AVX1-NEXT: vpextrw $6, %xmm2, %eax ; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm12 -; AVX1-NEXT: vpextrw $1, %xmm2, %edx +; AVX1-NEXT: vmovd %edx, %xmm13 +; AVX1-NEXT: vpextrw $5, %xmm3, %edx ; AVX1-NEXT: decl %ecx -; AVX1-NEXT: vmovd %ecx, %xmm13 -; AVX1-NEXT: vpextrw $0, %xmm2, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm14 +; AVX1-NEXT: vpextrw $4, %xmm3, %ecx ; AVX1-NEXT: decl %eax -; AVX1-NEXT: vmovd %eax, %xmm14 -; AVX1-NEXT: vpextrw $3, %xmm2, %eax -; AVX1-NEXT: decq %rdx -; AVX1-NEXT: vmovq %rdx, %xmm15 -; AVX1-NEXT: vpextrw $2, %xmm2, %edx -; AVX1-NEXT: decq %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovd %eax, %xmm3 +; 
AVX1-NEXT: vpextrw $5, %xmm2, %eax +; AVX1-NEXT: decl %edx +; AVX1-NEXT: vmovd %edx, %xmm15 +; AVX1-NEXT: vpextrw $4, %xmm2, %edx +; AVX1-NEXT: decl %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 ; AVX1-NEXT: decl %eax ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-NEXT: vmovd %eax, %xmm5 ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-NEXT: vmovd %edx, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll index e5696ded4fbf1..021b9747795ef 100644 --- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll +++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll @@ -75,14 +75,14 @@ define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 
%esi, %eax ; X86-NEXT: negl %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: andl %esi, %edx -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: andl %esi, %eax ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovel %ecx, %edx +; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -399,15 +399,15 @@ define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $-1, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: andl %esi, %edx -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: andl %esi, %eax ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -691,15 +691,15 @@ define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $-1, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: xorl %esi, %edx -; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %esi, %eax ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index ff9f995c4765b..ff9d90be761f8 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -248,9 +248,11 @@ define void @PR52039(ptr %pa, ptr %pb) { ; ; AVX1-LABEL: PR52039: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [10,10,10,10] -; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu (%rdi), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [10,10,10,10] +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm3 @@ -259,6 +261,7 @@ define void @PR52039(ptr %pa, ptr %pb) { ; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi) ; AVX1-NEXT: vmovdqu %xmm2, (%rdi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR52039: diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 455b72d16a075..d1a0930d8b7b3 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -208,7 +208,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: bsrl %ebx, %edx @@ -216,11 +216,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) 
nounwind { ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bsrl %edi, %edi ; X86-NEXT: xorl $31, %edi -; X86-NEXT: orl $32, %edi +; X86-NEXT: addl $32, %edi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %edi -; X86-NEXT: orl $64, %edi +; X86-NEXT: addl $64, %edi ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %edx @@ -230,7 +230,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -238,10 +238,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx -; X86-NEXT: orl $32, %edx +; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: orl $64, %edx +; X86-NEXT: addl $64, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: orl %eax, %esi ; X86-NEXT: cmovnel %ecx, %edx diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 859e9244d29d2..370e1c608e44f 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -173,17 +173,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl 48(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebx, %eax ; X86-NEXT: xorl $31, %eax -; X86-NEXT: orl $32, %eax +; X86-NEXT: addl $32, %eax ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: orl $64, %eax +; X86-NEXT: addl $64, %eax ; X86-NEXT: movl 48(%ebp), %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %eax @@ -193,7 +193,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl 32(%ebp), %ecx ; X86-NEXT: bsrl %ecx, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl 28(%ebp), %edi @@ -201,10 +201,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl 24(%ebp), %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: orl $32, %edx +; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: orl $64, %edx +; X86-NEXT: addl $64, %edx ; X86-NEXT: movl 32(%ebp), %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..7cd06c66ade09 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -203,13 +203,12 @@ define <4 x i32> @freeze_add_vec_undef(<4 x i32> %a0) nounwind { ; X86-LABEL: freeze_add_vec_undef: ; X86: # %bb.0: ; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: paddd 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_add_vec_undef: ; X64: # %bb.0: -; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] +; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %x = add <4 x i32> %a0, %y = freeze <4 x i32> %x @@ -274,13 +273,12 @@ define <4 x i32> @freeze_sub_vec_undef(<4 x i32> %a0) nounwind { ; X86-LABEL: freeze_sub_vec_undef: ; X86: # %bb.0: ; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_sub_vec_undef: ; X64: # %bb.0: -; X64-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] +; X64-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %x = sub <4 x i32> %a0, %y = freeze <4 x i32> %x @@ -345,14 +343,12 @@ define <8 x i16> @freeze_mul_vec(<8 x i16> %a0) nounwind { define <8 x i16> @freeze_mul_vec_undef(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_mul_vec_undef: ; X86: # %bb.0: -; X86-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,3,4,4,3,0,1] -; X86-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,3,2,1,1,2,u,4] +; X86-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,6,6,4,4,6,0,4] ; X86-NEXT: retl ; ; X64-LABEL: freeze_mul_vec_undef: ; X64: # %bb.0: -; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,3,4,4,3,0,1] -; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,3,2,1,1,2,u,4] +; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,6,6,4,4,6,0,4] ; X64-NEXT: retq %x = mul <8 x i16> %a0, %y = freeze <8 x i16> %x @@ -452,8 +448,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind { ; X86-LABEL: freeze_ashr: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: sarl $3, %eax +; X86-NEXT: sarl $6, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr: @@ -471,15 +466,13 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { ; X86-LABEL: freeze_ashr_exact: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: sarl $6, %eax +; X86-NEXT: sarl $9, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $3, %eax -; X64-NEXT: sarl $6, %eax +; X64-NEXT: sarl $9, %eax ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -492,18 +485,19 @@ define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $3, %edx +; X86-NEXT: sarl $9, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: sarl $3, %ecx +; X64-NEXT: sarl $9, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -566,8 +560,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind { ; X86-LABEL: freeze_lshr: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $2, %eax -; X86-NEXT: shrl %eax +; X86-NEXT: shrl $3, %eax 
; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr: @@ -585,15 +578,13 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { ; X86-LABEL: freeze_lshr_exact: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: shrl $5, %eax +; X86-NEXT: shrl $8, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: shrl $5, %eax +; X64-NEXT: shrl $8, %eax ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x @@ -606,18 +597,19 @@ define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: shrl $8, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 15b43c41b9945..40229017bd0d2 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: vmovdqa (%edx), %xmm0 -; X86-NEXT: vpand (%ecx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%ecx), %xmm0 +; X86-NEXT: vpand (%edx), %xmm0, %xmm0 ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: freeze_extractelement: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm0 -; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rsi), %xmm0 +; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq %i0 = load <16 x i8>, ptr %origin0 @@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: vmovdqa (%esi), %xmm0 -; X86-NEXT: vpand (%edx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%edx), %xmm0 +; X86-NEXT: vpand (%esi), %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: popl %esi @@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; ; X64-LABEL: freeze_extractelement_escape: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm0 -; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rsi), %xmm0 +; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq @@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X86-NEXT: movl 32(%ebp), %edx ; X86-NEXT: movl 12(%ebp), %esi ; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: vmovaps (%edi), %xmm0 -; X86-NEXT: vandps (%esi), %xmm0, %xmm0 +; X86-NEXT: vmovaps (%esi), %xmm0 +; X86-NEXT: vandps (%edi), %xmm0, %xmm0 ; X86-NEXT: vmovaps %xmm0, (%esp) ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: 
cmpb (%esp,%eax), %cl @@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X64: # %bb.0: ; X64-NEXT: andl $15, %ecx ; X64-NEXT: andl $15, %edx -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vandps (%rsi), %xmm0, %xmm0 +; X64-NEXT: vmovaps (%rsi), %xmm0 +; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -24(%rsp,%rdx), %eax ; X64-NEXT: cmpb -24(%rsp,%rcx), %al diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll index d151c6f28e51b..9aa0a1dedf594 100644 --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -1009,19 +1009,18 @@ define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind { ; X86AVX2-NEXT: pushl %esi ; X86AVX2-NEXT: andl $-16, %esp ; X86AVX2-NEXT: subl $48, %esp -; X86AVX2-NEXT: movl 8(%ebp), %edx -; X86AVX2-NEXT: movl 12(%ebp), %eax -; X86AVX2-NEXT: movl 16(%ebp), %ecx +; X86AVX2-NEXT: movl 16(%ebp), %eax +; X86AVX2-NEXT: movl 8(%ebp), %ecx +; X86AVX2-NEXT: movl 12(%ebp), %edx ; X86AVX2-NEXT: vmovaps %xmm0, (%esp) -; X86AVX2-NEXT: addl %ecx, %ecx -; X86AVX2-NEXT: movl %ecx, %esi +; X86AVX2-NEXT: leal (%eax,%eax), %esi ; X86AVX2-NEXT: andl $3, %esi -; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) +; X86AVX2-NEXT: movl %ecx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %xmm0 ; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: incl %ecx -; X86AVX2-NEXT: andl $3, %ecx -; X86AVX2-NEXT: movl %eax, 16(%esp,%ecx,4) +; X86AVX2-NEXT: leal 1(%eax,%eax), %eax +; X86AVX2-NEXT: andl $3, %eax +; X86AVX2-NEXT: movl %edx, 16(%esp,%eax,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ; X86AVX2-NEXT: leal -4(%ebp), %esp ; X86AVX2-NEXT: popl %esi @@ -1363,13 +1362,12 @@ define <2 x i64> @load_i64_v2i64(<2 x i64> %v, ptr %p, i32 %y) nounwind { ; X86AVX2-NEXT: movl (%ecx), %edx ; X86AVX2-NEXT: movl 4(%ecx), %ecx ; X86AVX2-NEXT: vmovaps %xmm0, (%esp) -; X86AVX2-NEXT: addl %eax, %eax -; X86AVX2-NEXT: movl %eax, %esi +; X86AVX2-NEXT: leal (%eax,%eax), %esi ; X86AVX2-NEXT: andl $3, %esi ; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %xmm0 ; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: incl %eax +; X86AVX2-NEXT: leal 1(%eax,%eax), %eax ; X86AVX2-NEXT: andl $3, %eax ; X86AVX2-NEXT: movl %ecx, 16(%esp,%eax,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 @@ -1744,19 +1742,18 @@ define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind { ; X86AVX2-NEXT: pushl %esi ; X86AVX2-NEXT: andl $-32, %esp ; X86AVX2-NEXT: subl $96, %esp -; X86AVX2-NEXT: movl 8(%ebp), %edx -; X86AVX2-NEXT: movl 12(%ebp), %eax -; X86AVX2-NEXT: movl 16(%ebp), %ecx +; X86AVX2-NEXT: movl 16(%ebp), %eax +; X86AVX2-NEXT: movl 8(%ebp), %ecx +; X86AVX2-NEXT: movl 12(%ebp), %edx ; X86AVX2-NEXT: vmovaps %ymm0, (%esp) -; X86AVX2-NEXT: addl %ecx, %ecx -; X86AVX2-NEXT: movl %ecx, %esi +; X86AVX2-NEXT: leal (%eax,%eax), %esi ; X86AVX2-NEXT: andl $7, %esi -; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) +; X86AVX2-NEXT: movl %ecx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %ymm0 ; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: incl %ecx -; X86AVX2-NEXT: andl $7, %ecx -; X86AVX2-NEXT: movl %eax, 32(%esp,%ecx,4) +; X86AVX2-NEXT: leal 1(%eax,%eax), %eax +; X86AVX2-NEXT: andl $7, %eax +; X86AVX2-NEXT: movl %edx, 32(%esp,%eax,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ; X86AVX2-NEXT: leal -4(%ebp), %esp ; X86AVX2-NEXT: popl %esi @@ 
-2131,13 +2128,12 @@ define <4 x i64> @load_i64_v4i64(<4 x i64> %v, ptr %p, i32 %y) nounwind { ; X86AVX2-NEXT: movl (%ecx), %edx ; X86AVX2-NEXT: movl 4(%ecx), %ecx ; X86AVX2-NEXT: vmovaps %ymm0, (%esp) -; X86AVX2-NEXT: addl %eax, %eax -; X86AVX2-NEXT: movl %eax, %esi +; X86AVX2-NEXT: leal (%eax,%eax), %esi ; X86AVX2-NEXT: andl $7, %esi ; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %ymm0 ; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: incl %eax +; X86AVX2-NEXT: leal 1(%eax,%eax), %eax ; X86AVX2-NEXT: andl $7, %eax ; X86AVX2-NEXT: movl %ecx, 32(%esp,%eax,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 473fecc307ed4..3290d31bd2e22 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -128,22 +128,22 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: psubb %xmm1, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,u,u,u,u,u,u] -; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: pcmpgtw %xmm0, %xmm1 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: por %xmm2, %xmm1 -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: paddw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,0,0,0,0,0,0] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-NEXT: pand %xmm0, %xmm1 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtw %xmm0, %xmm2 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: paddw %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm3 ; X64-NEXT: psraw $1, %xmm3 ; X64-NEXT: pcmpeqw %xmm0, %xmm3 ; X64-NEXT: movdqa %xmm3, %xmm0 -; X64-NEXT: pandn %xmm1, %xmm0 -; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: pandn %xmm2, %xmm0 +; X64-NEXT: pand %xmm1, %xmm3 ; X64-NEXT: por %xmm0, %xmm3 ; X64-NEXT: movd %xmm3, (%rdi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 7c9adaf31aff5..1254100e88a82 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -159,21 +159,21 @@ define <8 x i32> @vec256_i32_unsigned_reg_reg(<8 x i32> %a1, <8 x i32> %a2) noun define <8 x i32> @vec256_i32_signed_mem_reg(ptr %a1_addr, <8 x i32> %a2) nounwind { ; AVX1-LABEL: vec256_i32_signed_mem_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm4 -; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovdqa (%rdi), %ymm1 +; AVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminsd %xmm0, %xmm2, %xmm4 -; AVX1-NEXT: vpmaxsd %xmm0, %xmm2, 
%xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpminsd %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vpmaxsd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpmulld %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpmulld %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -190,20 +190,20 @@ define <8 x i32> @vec256_i32_signed_mem_reg(ptr %a1_addr, <8 x i32> %a2) nounwin ; ; XOP-LABEL: vec256_i32_signed_mem_reg: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpminsd %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpsubd %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpminsd %xmm0, %xmm2, %xmm4 -; XOP-NEXT: vpmaxsd %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpminsd %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpsubd %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpminsd %xmm0, %xmm1, %xmm4 +; XOP-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpsubd %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpsrld $1, %xmm0, %xmm0 -; XOP-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOP-NEXT: vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vpmacsdd %xmm2, %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpsrld $1, %xmm2, %xmm2 +; XOP-NEXT: vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpmacsdd %xmm1, %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: vec256_i32_signed_mem_reg: @@ -303,23 +303,23 @@ define <8 x i32> @vec256_i32_signed_reg_mem(<8 x i32> %a1, ptr %a2_addr) nounwin define <8 x i32> @vec256_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i32_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %ymm0 -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm4 -; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminsd %xmm0, %xmm2, %xmm4 -; AVX1-NEXT: vpmaxsd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpmulld %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpmulld %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpmulld %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i32_signed_mem_mem: @@ -336,20 +336,20 @@ define <8 x i32> @vec256_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; ; XOP-LABEL: vec256_i32_signed_mem_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rsi), %ymm0 -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 +; XOP-NEXT: vmovdqa (%rdi), %ymm0 +; XOP-NEXT: vmovdqa (%rsi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpminsd %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpmaxsd %xmm3, %xmm2, %xmm3 -; XOP-NEXT: vpsubd %xmm4, %xmm3, %xmm3 -; XOP-NEXT: vpminsd %xmm0, %xmm1, %xmm4 -; XOP-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; XOP-NEXT: vpsrld $1, %xmm0, %xmm0 -; XOP-NEXT: vpsrld $1, %xmm3, %xmm3 -; XOP-NEXT: vpmacsdd %xmm2, %xmm3, %xmm3, %xmm2 -; XOP-NEXT: vpmacsdd %xmm1, %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpminsd %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpsubd %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpminsd %xmm1, %xmm0, %xmm4 +; XOP-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubd %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOP-NEXT: vpsrld $1, %xmm2, %xmm2 +; XOP-NEXT: vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; @@ -727,18 +727,18 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwind { ; AVX1-LABEL: vec256_i64_signed_mem_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] @@ -749,19 +749,19 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 -; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm0, %xmm2, 
%xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i64_signed_mem_reg: @@ -787,18 +787,18 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; ; XOP-LABEL: vec256_i64_signed_mem_reg: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm5 +; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 -; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 +; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 +; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] @@ -809,19 +809,19 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 ; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 ; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 -; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2 ; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 -; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 +; XOP-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 ; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 ; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; XOP-NEXT: vpsllq $32, %xmm2, %xmm2 ; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 ; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i64_signed_mem_reg: @@ -897,27 +897,27 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i64_signed_reg_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa (%rdi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, 
%xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm7 -; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 +; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 -; AVX1-NEXT: vpmuludq %xmm2, %xmm9, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 ; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 @@ -927,11 +927,11 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i64_signed_reg_mem: @@ -957,27 +957,27 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; ; XOP-LABEL: vec256_i64_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm3, %xmm1, %xmm4 -; XOP-NEXT: vpcomgtq %xmm2, %xmm0, %xmm5 -; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm2 -; XOP-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; XOP-NEXT: vpsubq %xmm2, %xmm5, %xmm2 -; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpcomgtq %xmm3, %xmm2, %xmm4 +; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm3 ; XOP-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpsubq %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpsrlq $1, %xmm3, %xmm6 -; XOP-NEXT: vpsrlq $1, %xmm2, %xmm7 -; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2 +; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 +; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 -; XOP-NEXT: vpmuludq %xmm2, %xmm9, %xmm2 +; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOP-NEXT: vpsllq $32, %xmm2, %xmm2 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 ; XOP-NEXT: vpsrlq $33, %xmm3, %xmm3 ; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 @@ -987,11 +987,11 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpsllq $32, %xmm3, %xmm3 ; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; XOP-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; 
XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i64_signed_reg_mem: @@ -1067,42 +1067,42 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i64_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 -; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 +; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 -; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 -; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i64_signed_mem_mem: @@ -1129,42 +1129,42 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; ; XOP-LABEL: vec256_i64_signed_mem_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rsi), %xmm0 -; XOP-NEXT: vmovdqa 16(%rsi), %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vpsubq 
%xmm0, %xmm5, %xmm0 -; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 -; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 -; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 +; XOP-NEXT: vmovdqa (%rdi), %ymm0 +; XOP-NEXT: vmovdqa (%rsi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 +; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 +; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 +; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 -; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 +; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 -; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2 ; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 -; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 +; XOP-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 ; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 ; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; XOP-NEXT: vpsllq $32, %xmm2, %xmm2 ; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 ; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i64_signed_mem_mem: @@ -1499,27 +1499,27 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounwind { ; AVX1-LABEL: vec256_i16_signed_mem_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpminsw %xmm0, %xmm2, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpminsw %xmm0, %xmm1, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsubw %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpminsw %xmm1, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i16_signed_mem_reg: @@ -1537,25 +1537,25 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw ; ; XOP-LABEL: vec256_i16_signed_mem_reg: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtw %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtw %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpminsw %xmm1, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpminsw %xmm0, %xmm2, %xmm6 -; XOP-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpcomgtw %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtw %xmm0, %xmm1, %xmm5 +; XOP-NEXT: vpminsw %xmm2, %xmm3, %xmm6 +; XOP-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpminsw %xmm0, %xmm1, %xmm6 +; XOP-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0 -; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 +; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpmacsww %xmm2, %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpmacsww %xmm1, %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i16_signed_mem_reg: @@ -1627,27 +1627,27 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i16_signed_reg_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa (%rdi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm3, %xmm2, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpsubw %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpmullw 
%xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i16_signed_reg_mem: @@ -1665,25 +1665,25 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw ; ; XOP-LABEL: vec256_i16_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtw %xmm3, %xmm1, %xmm4 -; XOP-NEXT: vpcomgtw %xmm2, %xmm0, %xmm5 -; XOP-NEXT: vpminsw %xmm3, %xmm1, %xmm6 -; XOP-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpcomgtw %xmm3, %xmm2, %xmm4 +; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm5 +; XOP-NEXT: vpminsw %xmm3, %xmm2, %xmm6 +; XOP-NEXT: vpmaxsw %xmm3, %xmm2, %xmm3 ; XOP-NEXT: vpsubw %xmm6, %xmm3, %xmm3 -; XOP-NEXT: vpminsw %xmm2, %xmm0, %xmm6 -; XOP-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2 -; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 +; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm3, %xmm3 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; XOP-NEXT: vpmacsww %xmm1, %xmm4, %xmm3, %xmm1 -; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm2, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpmacsww %xmm2, %xmm4, %xmm3, %xmm2 +; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i16_signed_reg_mem: @@ -1755,28 +1755,28 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i16_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpminsw %xmm0, %xmm2, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsubw %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpminsw %xmm1, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm3, %xmm1, 
%xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i16_signed_mem_mem: @@ -1795,26 +1795,26 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; ; XOP-LABEL: vec256_i16_signed_mem_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rsi), %xmm0 -; XOP-NEXT: vmovdqa 16(%rsi), %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtw %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtw %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpminsw %xmm1, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %ymm0 +; XOP-NEXT: vmovdqa (%rsi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpcomgtw %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm5 +; XOP-NEXT: vpminsw %xmm2, %xmm3, %xmm6 +; XOP-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpminsw %xmm0, %xmm2, %xmm6 -; XOP-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 +; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpmacsww %xmm2, %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i16_signed_mem_mem: @@ -2247,9 +2247,9 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_reg: ; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5 ; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm6 @@ -2309,19 +2309,19 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; ; XOP-LABEL: vec256_i8_signed_mem_reg: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpminsb %xmm0, %xmm2, %xmm6 -; XOP-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5 +; XOP-NEXT: vpminsb %xmm0, %xmm1, %xmm6 +; XOP-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpsubb %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpminsb %xmm1, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; 
XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 @@ -2334,13 +2334,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 -; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 ; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_mem_reg: @@ -2425,9 +2425,9 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_reg_mem: ; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6 @@ -2487,38 +2487,38 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; ; XOP-LABEL: vec256_i8_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtb %xmm3, %xmm1, %xmm4 -; XOP-NEXT: vpcomgtb %xmm2, %xmm0, %xmm5 -; XOP-NEXT: vpminsb %xmm2, %xmm0, %xmm6 -; XOP-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2 -; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpminsb %xmm3, %xmm1, %xmm6 -; XOP-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vmovdqa (%rdi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOP-NEXT: vpcomgtb %xmm3, %xmm2, %xmm4 +; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 +; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpminsb %xmm3, %xmm2, %xmm6 +; XOP-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 ; XOP-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; XOP-NEXT: vpshlb %xmm6, %xmm3, %xmm3 -; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 -; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] -; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2 +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6 ; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3 -; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_reg_mem: @@ -2603,44 +2603,44 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i8_signed_mem_mem: @@ 
-2667,39 +2667,39 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; ; XOP-LABEL: vec256_i8_signed_mem_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rsi), %xmm0 -; XOP-NEXT: vmovdqa 16(%rsi), %xmm1 -; XOP-NEXT: vmovdqa (%rdi), %xmm2 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpminsb %xmm0, %xmm2, %xmm6 -; XOP-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpsubb %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpminsb %xmm1, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %ymm0 +; XOP-NEXT: vmovdqa (%rsi), %ymm1 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 +; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 -; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] -; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 -; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 ; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_mem_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index a4750b4cd4ad0..582cc9c3e8055 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -426,58 +426,58 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounwind { ; AVX512F-LABEL: vec512_i16_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, 
%zmm4 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsw %ymm0, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_mem_reg: @@ -507,58 +507,58 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpminsw %ymm3, %ymm2, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm3, %ymm2, %ymm3 ; AVX512F-NEXT: vpsubw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm5 -; 
AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm2, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm6, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_reg_mem: @@ -588,60 +588,60 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; 
AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_mem_mem: @@ -849,66 +849,64 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw define <64 x i8> 
@vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpsubb %ymm0, %ymm6, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512VL-FALLBACK-NEXT: 
vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm6, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_reg: @@ -939,66 +937,64 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm5 -; 
AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 ; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; 
AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm7, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_mem: @@ -1029,68 +1025,66 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, 
%ymm3, %ymm2 +; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; 
AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_mem: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index f53983036a016..0743bc76d4ea7 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -226,36 +226,36 @@ define void @PR42833() { ; ; AVX1-LABEL: PR42833: ; AVX1: # %bb.0: -; AVX1-NEXT: movl b(%rip), %eax -; AVX1-NEXT: addl c+128(%rip), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vmovdqa c+128(%rip), %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2 -; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovups %ymm0, c+128(%rip) -; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1 -; AVX1-NEXT: vmovdqa c+176(%rip), %xmm3 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa d+160(%rip), %xmm4 -; AVX1-NEXT: vmovdqa c+160(%rip), %xmm5 -; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa %xmm2, d+144(%rip) -; AVX1-NEXT: vmovdqa %xmm4, d+160(%rip) -; AVX1-NEXT: vmovdqa %xmm1, d+176(%rip) +; AVX1-NEXT: vmovdqu c+128(%rip), %ymm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: addl b(%rip), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd 
%xmm3, %xmm3, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7] +; AVX1-NEXT: vmovups %ymm1, c+128(%rip) +; AVX1-NEXT: vmovdqu c+160(%rip), %ymm1 +; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa d+128(%rip), %xmm2 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa d+176(%rip), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa d+160(%rip), %xmm5 +; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa d+144(%rip), %xmm6 +; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, d+144(%rip) +; AVX1-NEXT: vmovdqa %xmm5, d+160(%rip) +; AVX1-NEXT: vmovdqa %xmm2, d+176(%rip) ; AVX1-NEXT: vmovdqa %xmm0, d+128(%rip) -; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0 -; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, c+160(%rip) -; AVX1-NEXT: vmovdqa %xmm0, c+176(%rip) +; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, c+176(%rip) +; AVX1-NEXT: vmovdqa %xmm0, c+160(%rip) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -308,36 +308,36 @@ define void @PR42833() { ; ; XOP-LABEL: PR42833: ; XOP: # %bb.0: -; XOP-NEXT: movl b(%rip), %eax -; XOP-NEXT: addl c+128(%rip), %eax -; XOP-NEXT: vmovd %eax, %xmm0 -; XOP-NEXT: vmovdqa c+128(%rip), %xmm1 -; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm2 -; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 -; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] -; XOP-NEXT: vmovdqa d+144(%rip), %xmm2 -; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 -; XOP-NEXT: vmovups %ymm0, c+128(%rip) -; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 -; XOP-NEXT: vmovdqa d+128(%rip), %xmm1 -; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vmovdqa d+176(%rip), %xmm1 -; XOP-NEXT: vmovdqa c+176(%rip), %xmm3 -; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa d+160(%rip), %xmm4 -; XOP-NEXT: vmovdqa c+160(%rip), %xmm5 -; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4 -; XOP-NEXT: vmovdqa %xmm2, d+144(%rip) -; XOP-NEXT: vmovdqa %xmm4, d+160(%rip) -; XOP-NEXT: vmovdqa %xmm1, d+176(%rip) +; XOP-NEXT: vmovdqu c+128(%rip), %ymm0 +; XOP-NEXT: vmovd %xmm0, %eax +; XOP-NEXT: addl b(%rip), %eax +; XOP-NEXT: vmovd %eax, %xmm1 +; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7] +; XOP-NEXT: vmovups %ymm1, c+128(%rip) +; XOP-NEXT: vmovdqu c+160(%rip), %ymm1 +; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 +; XOP-NEXT: vmovdqa d+128(%rip), %xmm2 +; XOP-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vmovdqa d+176(%rip), %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; XOP-NEXT: vpsubd %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vmovdqa d+160(%rip), %xmm5 +; XOP-NEXT: vpsubd %xmm1, %xmm5, %xmm5 +; XOP-NEXT: vmovdqa d+144(%rip), %xmm6 +; XOP-NEXT: vpsubd %xmm3, %xmm6, %xmm3 +; XOP-NEXT: vmovdqa %xmm3, d+144(%rip) +; XOP-NEXT: vmovdqa %xmm5, d+160(%rip) +; XOP-NEXT: vmovdqa %xmm2, d+176(%rip) ; XOP-NEXT: vmovdqa %xmm0, d+128(%rip) -; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0 -; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; XOP-NEXT: vmovdqa %xmm1, c+160(%rip) -; XOP-NEXT: vmovdqa %xmm0, c+176(%rip) +; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; XOP-NEXT: 
vpaddd %xmm4, %xmm4, %xmm1 +; XOP-NEXT: vmovdqa %xmm1, c+176(%rip) +; XOP-NEXT: vmovdqa %xmm0, c+160(%rip) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %1 = load i32, ptr @b, align 4 diff --git a/llvm/test/CodeGen/X86/pr120906.ll b/llvm/test/CodeGen/X86/pr120906.ll index f5f6331bf3bf6..5ba36dea0bcfd 100644 --- a/llvm/test/CodeGen/X86/pr120906.ll +++ b/llvm/test/CodeGen/X86/pr120906.ll @@ -5,23 +5,8 @@ define i32 @PR120906(ptr %p) { ; CHECK-LABEL: PR120906: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $564341309, (%rdi) # imm = 0x21A32A3D -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: paddb %xmm1, %xmm1 -; CHECK-NEXT: paddb %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: pcmpgtb %xmm1, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11,u,u,u,u,u,u,u,u,u,u,u,u] -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: paddb %xmm1, %xmm3 -; CHECK-NEXT: pand %xmm2, %xmm3 -; CHECK-NEXT: pandn %xmm1, %xmm2 -; CHECK-NEXT: por %xmm1, %xmm2 -; CHECK-NEXT: por %xmm3, %xmm2 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [11,0,11,0,11,0,11,0] +; CHECK-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 2ac2be5545dfd..d2b292f1a7996 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body ; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5 -; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6 +; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5 +; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi ; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8 ; CHECK-AVX2-NEXT: vmovq %xmm5, %r9 diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index a82656e4b7147..d25cb44209f7f 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -1082,25 +1082,25 @@ define i128 @shift_i128_limited_shamt_unknown_lhs(i128 noundef %a, i32 noundef % ; i686-NEXT: pushl %esi ; i686-NEXT: andl $-16, %esp ; i686-NEXT: subl $48, %esp +; i686-NEXT: movl 44(%ebp), %ecx ; i686-NEXT: movl 24(%ebp), %eax ; i686-NEXT: movl 28(%ebp), %edx ; i686-NEXT: movl 32(%ebp), %esi ; i686-NEXT: movl 36(%ebp), %edi -; i686-NEXT: movl 44(%ebp), %ecx ; i686-NEXT: subl 40(%ebp), %ecx ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl $0, (%esp) ; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shrb $3, %al ; i686-NEXT: andb $12, %al ; i686-NEXT: negb %al ; i686-NEXT: movsbl %al, %eax +; i686-NEXT: movl $0, {{[0-9]+}}(%esp) +; i686-NEXT: movl $0, {{[0-9]+}}(%esp) 
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp) +; i686-NEXT: movl $0, (%esp) ; i686-NEXT: movl 20(%esp,%eax), %edx ; i686-NEXT: movl 24(%esp,%eax), %ebx ; i686-NEXT: movl %ebx, %edi diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll index bf027a7346deb..d55cc993aaf75 100644 --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -1543,9 +1543,10 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: cmpl $3, %eax -; SSE2-NEXT: sete %al +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; ; SSE42-LABEL: select_v2i8: @@ -1556,9 +1557,10 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; SSE42-NEXT: movd %eax, %xmm1 ; SSE42-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 -; SSE42-NEXT: movmskpd %xmm0, %eax -; SSE42-NEXT: cmpl $3, %eax -; SSE42-NEXT: sete %al +; SSE42-NEXT: movmskpd %xmm0, %ecx +; SSE42-NEXT: movl %ecx, %eax +; SSE42-NEXT: shrb %al +; SSE42-NEXT: andb %cl, %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: @@ -1569,21 +1571,16 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; AVX1OR2-NEXT: vmovd %eax, %xmm1 ; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0 -; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; AVX512-NEXT: knotw %k0, %k0 -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $3, %al +; AVX512-NEXT: cmpw %ax, (%rdi) ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %v0 = load <2 x i8>, ptr %s0, align 1 diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll index 2df39d69dbb75..699cebf042a80 100644 --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -1426,9 +1426,10 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; ; SSE42-LABEL: select_v2i8: @@ -1439,9 +1440,10 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; SSE42-NEXT: movd %eax, %xmm1 ; SSE42-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 -; SSE42-NEXT: movmskpd %xmm0, %eax -; SSE42-NEXT: testl %eax, %eax -; SSE42-NEXT: setne %al +; SSE42-NEXT: movmskpd %xmm0, %ecx +; SSE42-NEXT: movl %ecx, %eax +; SSE42-NEXT: shrb %al +; SSE42-NEXT: orb %cl, %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: @@ -1452,8 +1454,10 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; AVX1OR2-NEXT: vmovd %eax, %xmm1 ; 
AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index bf525442a419b..339905802159f 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -730,65 +730,65 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: psllw $5, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: psllw $5, %xmm5 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm7 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: paddb %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm7 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: paddb %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm5, %xmm2 -; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm5, %xmm6 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pandn %xmm0, %xmm7 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: paddb %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pandn %xmm0, %xmm7 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: paddb %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 
; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: psrlw $2, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v16i8: @@ -1020,65 +1020,65 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pand %xmm5, %xmm6 -; X86-SSE2-NEXT: psllw $5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: psllw $5, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 -; X86-SSE2-NEXT: pandn %xmm1, %xmm7 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE2-NEXT: por %xmm7, %xmm3 -; X86-SSE2-NEXT: paddb %xmm6, %xmm6 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: pandn %xmm3, %xmm7 -; X86-SSE2-NEXT: psrlw $2, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE2-NEXT: por %xmm7, %xmm3 -; X86-SSE2-NEXT: paddb %xmm6, %xmm6 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pandn %xmm3, %xmm6 -; X86-SSE2-NEXT: psrlw $1, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE2-NEXT: por %xmm6, %xmm3 -; X86-SSE2-NEXT: pandn %xmm5, %xmm2 -; X86-SSE2-NEXT: psllw $5, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pandn %xmm0, %xmm7 ; X86-SSE2-NEXT: psllw $4, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm6, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: por %xmm7, %xmm0 +; X86-SSE2-NEXT: paddb %xmm5, %xmm5 +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pandn %xmm0, %xmm7 +; X86-SSE2-NEXT: psllw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm6, %xmm0 ; X86-SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: por %xmm7, %xmm0 +; X86-SSE2-NEXT: paddb %xmm5, %xmm5 +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE2-NEXT: pandn %xmm0, %xmm5 +; X86-SSE2-NEXT: paddb %xmm0, %xmm0 +; X86-SSE2-NEXT: pand %xmm6, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: psllw $5, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn %xmm1, %xmm5 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: por %xmm5, %xmm4 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm5 -; X86-SSE2-NEXT: psllw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: por %xmm5, %xmm4 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm0 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ret <16 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-shift-lut.ll b/llvm/test/CodeGen/X86/vector-shift-lut.ll index 0bf2006090893..9c14f6a344bc8 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lut.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lut.ll @@ -196,48 +196,43 @@ define <32 x i8> @uniform_shl_v32i8(<32 x i8> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psllw $5, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: por %xmm7, %xmm6 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: psllw $2, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtb %xmm2, 
%xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: por %xmm7, %xmm6 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm7, %xmm4 ; SSE2-NEXT: psllw $2, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index da8a3f3fa0d4e..4997518d47c49 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -671,29 +671,29 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; ; AVX1-LABEL: trunc_packus_v8i64_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,2],xmm0[0,2] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; 
AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1628,31 +1628,32 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; ; AVX1-LABEL: trunc_packus_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = [65535,65535] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = [65535,65535] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i16: @@ -3030,32 +3031,33 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; ; AVX1-LABEL: trunc_packus_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vpcmpgtq 
%xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8: @@ -3317,33 +3319,34 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; ; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq 
%xmm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: @@ -3797,55 +3800,56 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; ; AVX1-LABEL: trunc_packus_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8 +; AVX1-NEXT: vmovdqa (%rdi), %ymm2 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX1-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX1-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm5, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm3, %xmm5, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm5, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 ; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 -; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm8, %xmm9 +; AVX1-NEXT: vpand %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpackusdw %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm8 +; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4 +; 
AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm4 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v16i64_v16i8: diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll index 912ff750d9e91..b8e8523c58542 100644 --- a/llvm/test/CodeGen/X86/vshift-6.ll +++ b/llvm/test/CodeGen/X86/vshift-6.ll @@ -27,9 +27,10 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) { ; X86-LABEL: do_not_crash: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb %al, (%ecx) +; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm1 ; X86-NEXT: psllq $56, %xmm1 ; X86-NEXT: pcmpeqd %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d71084c..8100bb82d5f69 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -834,7 +834,6 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero @@ -845,26 +844,27 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al -; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -908,11 +908,10 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: -; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -925,28 +924,25 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> @@ -1453,7 
+1449,6 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 @@ -1464,26 +1459,27 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al -; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1527,11 +1523,10 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: -; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $64, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1544,28 +1539,25 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $64, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> @@ -1716,39 +1708,40 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ebx,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebx,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -1826,38 +1819,34 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, 
%bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8d36eef952a2b..70d47728ff478 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1095,7 +1095,6 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax @@ -1105,26 +1104,27 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al -; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1167,11 +1167,10 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: -; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1183,28 +1182,25 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -1723,7 +1719,6 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; 
X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 @@ -1735,26 +1730,27 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al -; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1799,11 +1795,10 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: -; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $64, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1817,28 +1812,25 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $64, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -1992,39 +1984,40 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ebx,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebx,4), %edi 
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -2104,38 +2097,34 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll index ea6bf66fd2923..e464f68bdc100 100644 --- a/llvm/test/CodeGen/X86/widen_arith-4.ll +++ b/llvm/test/CodeGen/X86/widen_arith-4.ll @@ -48,7 +48,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [271,271,271,271,271,u,u,u] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [271,271,271,271,271,0,0,0] ; SSE41-NEXT: .p2align 4 ; SSE41-NEXT: .LBB0_1: # %forcond ; SSE41-NEXT: # =>This Inner Loop Header: Depth=1