diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c008135854621..a73c59e361cbf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4075,18 +4075,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   unsigned BitWidth = VT.getScalarSizeInBits();
   SDLoc DL(N);
 
-  auto PeekThroughFreeze = [](SDValue N) {
-    if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
-      return N->getOperand(0);
-    return N;
-  };
-
   if (SDValue V = foldSubCtlzNot(N, DAG))
     return V;
 
   // fold (sub x, x) -> 0
-  // FIXME: Refactor this and xor and other similar operations together.
-  if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
+  if (N0 == N1)
     return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
 
   // fold (sub c1, c2) -> c3
@@ -16756,6 +16749,17 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
     return N0;
 
+  // If we have frozen and unfrozen users of N0, update so everything uses N.
+  if (!N0.isUndef() && !N0.hasOneUse()) {
+    SDValue FrozenN0(N, 0);
+    DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0);
+    // ReplaceAllUsesOfValueWith will have also updated the use in N, thus
+    // creating a cycle in a DAG. Let's undo that by mutating the freeze.
+    assert(N->getOperand(0) == FrozenN0 && "Expected cycle in DAG");
+    DAG.UpdateNodeOperands(N, N0);
+    return FrozenN0;
+  }
+
   // We currently avoid folding freeze over SRA/SRL, due to the problems seen
   // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
   // example https://reviews.llvm.org/D136529#4120959.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b3f131d4422d7..a68f521ee59cd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -775,13 +775,6 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
     break;
   }
-  case ISD::FREEZE: {
-    SDValue N0 = Op.getOperand(0);
-    if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
-                                             /*PoisonOnly=*/false, Depth + 1))
-      return N0;
-    break;
-  }
   case ISD::AND: {
     LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
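Editorial note (not part of the patch): the RAUW-then-unfold step in the visitFREEZE hunk above is subtle enough to deserve a standalone illustration. The sketch below models it on a toy use-list graph; Node and replaceAllUsesWith are hypothetical stand-ins for SDNode and DAG.ReplaceAllUsesOfValueWith, not the SelectionDAG API. It shows why blanket replacement necessarily makes the freeze its own operand, and why patching that single edge back restores a well-formed graph.

// Toy model of the visitFREEZE rewrite: replace every use of N0 with
// freeze(N0), observe the self-cycle this creates through the freeze,
// then undo just that one edge (mirroring DAG.UpdateNodeOperands(N, N0)).
#include <algorithm>
#include <cassert>
#include <vector>

struct Node {
  std::vector<Node *> operands;
  std::vector<Node *> users;
};

// Point every user of From at To, as a use-list RAUW would.
static void replaceAllUsesWith(Node *From, Node *To) {
  for (Node *User : From->users) {
    std::replace(User->operands.begin(), User->operands.end(), From, To);
    To->users.push_back(User);
  }
  From->users.clear();
}

int main() {
  Node N0, Freeze, OtherUser; // Freeze = freeze(N0), plus one unfrozen user.
  Freeze.operands = {&N0};
  OtherUser.operands = {&N0};
  N0.users = {&Freeze, &OtherUser};

  // Step 1: make every user of N0, frozen and unfrozen alike, use the freeze.
  replaceAllUsesWith(&N0, &Freeze);
  // ...which also rewrote the freeze's own operand: a cycle.
  assert(Freeze.operands[0] == &Freeze);

  // Step 2: undo only that edge so the freeze again consumes N0 directly.
  Freeze.operands[0] = &N0;
  Freeze.users.erase(
      std::remove(Freeze.users.begin(), Freeze.users.end(), &Freeze),
      Freeze.users.end());
  N0.users.push_back(&Freeze);
  assert(Freeze.operands[0] == &N0 && OtherUser.operands[0] == &Freeze);
  return 0;
}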
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 9fe8c50640981..4f2b9c5a62669 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -175,12 +175,12 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i1
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: bic.8h v0, #128, lsl #8
 ; CHECK-NEXT: movi.4s v1, #10
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: ushll2.4s v0, v0, #0
-; CHECK-NEXT: cmhi.4s v3, v2, v1
-; CHECK-NEXT: cmhi.4s v1, v0, v1
-; CHECK-NEXT: and.16b v1, v1, v0
-; CHECK-NEXT: and.16b v0, v3, v2
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmhi.4s v3, v0, v1
+; CHECK-NEXT: cmhi.4s v1, v2, v1
+; CHECK-NEXT: and.16b v1, v1, v2
+; CHECK-NEXT: and.16b v0, v3, v0
 ; CHECK-NEXT: ret
 %ext = zext <8 x i15> %a to <8 x i32>
 %cmp = icmp ugt <8 x i15> %a,
@@ -289,12 +289,12 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: bic.8h v0, #224, lsl #8
 ; CHECK-NEXT: movi.4s v1, #10
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: ushll2.4s v0, v0, #0
-; CHECK-NEXT: cmeq.4s v3, v2, v1
-; CHECK-NEXT: cmeq.4s v1, v0, v1
-; CHECK-NEXT: and.16b v1, v1, v0
-; CHECK-NEXT: and.16b v0, v3, v2
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmeq.4s v3, v0, v1
+; CHECK-NEXT: cmeq.4s v1, v2, v1
+; CHECK-NEXT: and.16b v1, v1, v2
+; CHECK-NEXT: and.16b v0, v3, v0
 ; CHECK-NEXT: ret
 %ext = zext <8 x i13> %a to <8 x i32>
 %cmp = icmp eq <8 x i13> %a,
@@ -429,17 +429,17 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: ushll2.4s v2, v0, #0
-; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v2, v0, #0
+; CHECK-NEXT: ushll2.4s v0, v0, #0
 ; CHECK-NEXT: movi.4s v1, #10
 ; CHECK-NEXT: shl.4s v0, v0, #19
 ; CHECK-NEXT: shl.4s v2, v2, #19
 ; CHECK-NEXT: sshr.4s v0, v0, #19
 ; CHECK-NEXT: sshr.4s v2, v2, #19
-; CHECK-NEXT: cmeq.4s v3, v0, v1
-; CHECK-NEXT: cmeq.4s v1, v2, v1
-; CHECK-NEXT: and.16b v1, v1, v2
-; CHECK-NEXT: and.16b v0, v3, v0
+; CHECK-NEXT: cmeq.4s v3, v2, v1
+; CHECK-NEXT: cmeq.4s v1, v0, v1
+; CHECK-NEXT: and.16b v1, v1, v0
+; CHECK-NEXT: and.16b v0, v3, v2
 ; CHECK-NEXT: ret
 %ext = sext <8 x i13> %a to <8 x i32>
 %cmp = icmp eq <8 x i13> %a,
@@ -493,17 +493,17 @@ entry:
 define <8 x i32> @same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: ushll2.4s v2, v0, #0
-; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v2, v0, #0
+; CHECK-NEXT: ushll2.4s v0, v0, #0
 ; CHECK-NEXT: movi.4s v1, #10
 ; CHECK-NEXT: shl.4s v0, v0, #17
 ; CHECK-NEXT: shl.4s v2, v2, #17
 ; CHECK-NEXT: sshr.4s v0, v0, #17
 ; CHECK-NEXT: sshr.4s v2, v2, #17
-; CHECK-NEXT: cmge.4s v3, v0, v1
-; CHECK-NEXT: cmge.4s v1, v2, v1
-; CHECK-NEXT: and.16b v1, v1, v2
-; CHECK-NEXT: and.16b v0, v3, v0
+; CHECK-NEXT: cmge.4s v3, v2, v1
+; CHECK-NEXT: cmge.4s v1, v0, v1
+; CHECK-NEXT: and.16b v1, v1, v0
+; CHECK-NEXT: and.16b v0, v3, v2
 ; CHECK-NEXT: ret
 %ext = sext <8 x i15> %a to <8 x i32>
 %cmp = icmp sge <8 x i15> %a,
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index e6c38d29be949..55067023116f0 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -495,8 +495,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
+; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT: s_mov_b32 s14, s13
 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
@@ -2679,8 +2680,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
+; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT: s_mov_b32 s14, s13
 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index ac4f0df7506ae..308e86bbaf8fd 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -5692,10 +5692,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -5725,10 +5721,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6351,10 +6343,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -6384,10 +6372,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -12347,14 +12331,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
 ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX6-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -12392,14 +12371,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX7-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -12474,11 +12448,7 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT: global_load_dword v0, v[0:1], off
 ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: v_lshrrev_b16 v1, 8, v0
-; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SDAG-NEXT: global_store_byte v[2:3], v4, off offset:2
+; GFX10-SDAG-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2
 ; GFX10-SDAG-NEXT: global_store_short v[2:3], v0, off
 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -12499,36 +12469,15 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2
 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v4, off offset:2
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b16 v1, 8, v0
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX11-SDAG-FAKE16-NEXT: s_clause 0x1
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[2:3], v0, off offset:2
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v1, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: freeze_v3i8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_d16_hi_b8 v[2:3], v0, off offset:2
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v3i8:
 ; GFX11-GISEL: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index bfc01ef138721..d59f72ad7a1ac 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -8343,53 +8343,53 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX6-NEXT: s_mov_b32 s2, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: s_lshr_b32 s42, s5, 30
-; GFX6-NEXT: s_lshr_b32 s36, s5, 28
-; GFX6-NEXT: s_lshr_b32 s38, s5, 29
-; GFX6-NEXT: s_lshr_b32 s30, s5, 26
-; GFX6-NEXT: s_lshr_b32 s34, s5, 27
-; GFX6-NEXT: s_lshr_b32 s26, s5, 24
-; GFX6-NEXT: s_lshr_b32 s28, s5, 25
-; GFX6-NEXT: s_lshr_b32 s22, s5, 22
-; GFX6-NEXT: s_lshr_b32 s24, s5, 23
-; GFX6-NEXT: s_lshr_b32 s18, s5, 20
-; GFX6-NEXT: s_lshr_b32 s20, s5, 21
-; GFX6-NEXT: s_lshr_b32 s14, s5, 18
-; GFX6-NEXT: s_lshr_b32 s16, s5, 19
-; GFX6-NEXT: s_lshr_b32 s10, s5, 16
-; GFX6-NEXT: s_lshr_b32 s12, s5, 17
-; GFX6-NEXT: s_lshr_b32 s6, s5, 14
-; GFX6-NEXT: s_lshr_b32 s8, s5, 15
-; GFX6-NEXT: s_mov_b32 s40, s5
+; GFX6-NEXT: s_lshr_b32 s36, s4, 30
+; GFX6-NEXT: s_lshr_b32 s38, s4, 31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 28
+; GFX6-NEXT: s_lshr_b32 s34, s4, 29
+; GFX6-NEXT: s_lshr_b32 s26, s4, 26
+; GFX6-NEXT: s_lshr_b32 s28, s4, 27
+; GFX6-NEXT: s_lshr_b32 s22, s4, 24
+; GFX6-NEXT: s_lshr_b32 s24, s4, 25
+; GFX6-NEXT: s_lshr_b32 s18, s4, 22
+; GFX6-NEXT: s_lshr_b32 s20, s4, 23
+; GFX6-NEXT: s_lshr_b32 s14, s4, 20
+; GFX6-NEXT: s_lshr_b32 s16, s4, 21
+; GFX6-NEXT: s_lshr_b32 s10, s4, 18
+; GFX6-NEXT: s_lshr_b32 s12, s4, 19
+; GFX6-NEXT: s_lshr_b32 s6, s4, 16
+; GFX6-NEXT: s_lshr_b32 s8, s4, 17
 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
 ; GFX6-NEXT: v_mov_b32_e32 v4, s7
-; GFX6-NEXT: s_lshr_b32 s40, s5, 12
+; GFX6-NEXT: s_lshr_b32 s40, s4, 14
 ; GFX6-NEXT: v_mov_b32_e32 v0, s44
 ; GFX6-NEXT: v_mov_b32_e32 v1, s45
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
+; GFX6-NEXT: s_mov_b32 s44, s5
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
 ; GFX6-NEXT: v_mov_b32_e32 v6, s44
 ; GFX6-NEXT: v_mov_b32_e32 v7, s45
-; GFX6-NEXT: s_lshr_b32 s44, s5, 13
+; GFX6-NEXT: s_lshr_b32 s44, s4, 15
 ; GFX6-NEXT: v_mov_b32_e32 v2, s42
 ; GFX6-NEXT: v_mov_b32_e32 v3, s43
-; GFX6-NEXT: s_lshr_b32 s42, s5, 10
+; GFX6-NEXT: s_lshr_b32 s42, s4, 12
 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
 ; GFX6-NEXT: v_mov_b32_e32 v8, s36
 ; GFX6-NEXT: v_mov_b32_e32 v9, s37
-; GFX6-NEXT: s_lshr_b32 s36, s5, 11
+; GFX6-NEXT: s_lshr_b32 s36, s4, 13
 ; GFX6-NEXT: v_mov_b32_e32 v10, s38
 ; GFX6-NEXT: v_mov_b32_e32 v11, s39
-; GFX6-NEXT: s_lshr_b32 s38, s5, 8
+; GFX6-NEXT: s_lshr_b32 s38, s4, 10
 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
 ; GFX6-NEXT: v_mov_b32_e32 v12, s30
 ; GFX6-NEXT: v_mov_b32_e32 v13, s31
-; GFX6-NEXT: s_lshr_b32 s30, s5, 9
+; GFX6-NEXT: s_lshr_b32 s30, s4, 11
 ; GFX6-NEXT: v_mov_b32_e32 v14, s34
 ; GFX6-NEXT: v_mov_b32_e32 v15, s35
-; GFX6-NEXT: s_lshr_b32 s34, s5, 6
+; GFX6-NEXT: s_lshr_b32 s34, s4, 8
 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
 ; GFX6-NEXT: v_mov_b32_e32 v5, s7
@@ -8397,190 +8397,191 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v2, s26
 ; GFX6-NEXT: v_mov_b32_e32 v3, s27
-; GFX6-NEXT: s_lshr_b32 s26, s5, 7
+; GFX6-NEXT: s_lshr_b32 s26, s4, 9
 ; GFX6-NEXT: v_mov_b32_e32 v4, s28
 ; GFX6-NEXT: v_mov_b32_e32 v5, s29
-; GFX6-NEXT: s_lshr_b32 s28, s5, 4
+; GFX6-NEXT: s_lshr_b32 s28, s4, 6
 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v8, s22
 ; GFX6-NEXT: v_mov_b32_e32 v9, s23
-; GFX6-NEXT: s_lshr_b32 s22, s5, 5
+; GFX6-NEXT: s_lshr_b32 s22, s4, 7
 ; GFX6-NEXT: v_mov_b32_e32 v10, s24
 ; GFX6-NEXT: v_mov_b32_e32 v11, s25
-; GFX6-NEXT: s_lshr_b32 s24, s5, 2
+; GFX6-NEXT: s_lshr_b32 s24, s4, 4
 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v12, s18
 ; GFX6-NEXT: v_mov_b32_e32 v13, s19
-; GFX6-NEXT: s_lshr_b32 s18, s5, 3
+; GFX6-NEXT: s_lshr_b32 s18, s4, 5
 ; GFX6-NEXT: v_mov_b32_e32 v14, s20
 ; GFX6-NEXT: v_mov_b32_e32 v15, s21
-; GFX6-NEXT: s_lshr_b32 s20, s5, 1
+; GFX6-NEXT: s_lshr_b32 s20, s4, 2
 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v2, s14
 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
-; GFX6-NEXT: s_lshr_b32 s14, s4, 30
+; GFX6-NEXT: s_lshr_b32 s14, s4, 3
 ; GFX6-NEXT: v_mov_b32_e32 v4, s16
 ; GFX6-NEXT: v_mov_b32_e32 v5, s17
-; GFX6-NEXT: s_lshr_b32 s16, s4, 31
+; GFX6-NEXT: s_lshr_b32 s16, s4, 1
 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v8, s10
 ; GFX6-NEXT: v_mov_b32_e32 v9, s11
-; GFX6-NEXT: s_lshr_b32 s10, s4, 28
+; GFX6-NEXT: s_lshr_b32 s10, s5, 29
 ; GFX6-NEXT: v_mov_b32_e32 v10, s12
 ; GFX6-NEXT: v_mov_b32_e32 v11, s13
-; GFX6-NEXT: s_lshr_b32 s12, s4, 29
+; GFX6-NEXT: s_lshr_b32 s12, s5, 28
 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v12, s6
 ; GFX6-NEXT: v_mov_b32_e32 v13, s7
-; GFX6-NEXT: s_lshr_b32 s46, s4, 26
+; GFX6-NEXT: s_lshr_b32 s6, s5, 26
 ; GFX6-NEXT: v_mov_b32_e32 v14, s8
 ; GFX6-NEXT: v_mov_b32_e32 v15, s9
-; GFX6-NEXT: s_lshr_b32 s8, s4, 27
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000
+; GFX6-NEXT: s_lshr_b32 s8, s5, 27
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v2, s40
 ; GFX6-NEXT: v_mov_b32_e32 v3, s41
-; GFX6-NEXT: s_lshr_b32 s40, s4, 24
-; GFX6-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NEXT: s_lshr_b32 s44, s4, 25
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384
+; GFX6-NEXT: s_lshr_b32 s40, s5, 25
+; GFX6-NEXT: v_mov_b32_e32 v4, s44
+; GFX6-NEXT: v_mov_b32_e32 v5, s45
+; GFX6-NEXT: s_lshr_b32 s44, s5, 24
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s36
-; GFX6-NEXT: v_mov_b32_e32 v9, s37
-; GFX6-NEXT: s_lshr_b32 s36, s4, 22
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: s_lshr_b32 s42, s4, 23
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368
+; GFX6-NEXT: v_mov_b32_e32 v8, s42
+; GFX6-NEXT: v_mov_b32_e32 v9, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 22
+; GFX6-NEXT: v_mov_b32_e32 v10, s36
+; GFX6-NEXT: v_mov_b32_e32 v11, s37
+; GFX6-NEXT: s_lshr_b32 s36, s5, 23
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v12, s30
-; GFX6-NEXT: v_mov_b32_e32 v13, s31
-; GFX6-NEXT: s_lshr_b32 s30, s4, 20
-; GFX6-NEXT: v_mov_b32_e32 v14, s6
-; GFX6-NEXT: v_mov_b32_e32 v15, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 21
+; GFX6-NEXT: v_mov_b32_e32 v12, s38
+; GFX6-NEXT: v_mov_b32_e32 v13, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 20
+; GFX6-NEXT: v_mov_b32_e32 v14, s30
+; GFX6-NEXT: v_mov_b32_e32 v15, s31
+; GFX6-NEXT: s_lshr_b32 s4, s5, 21
 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352
-; GFX6-NEXT: v_mov_b32_e32 v16, s34
-; GFX6-NEXT: v_mov_b32_e32 v17, s35
-; GFX6-NEXT: s_lshr_b32 s34, s4, 18
-; GFX6-NEXT: v_mov_b32_e32 v18, s26
-; GFX6-NEXT: v_mov_b32_e32 v19, s27
-; GFX6-NEXT: s_lshr_b32 s26, s4, 19
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
+; GFX6-NEXT: v_mov_b32_e32 v3, s31
+; GFX6-NEXT: s_lshr_b32 s30, s5, 18
+; GFX6-NEXT: v_mov_b32_e32 v4, s26
+; GFX6-NEXT: v_mov_b32_e32 v5, s27
+; GFX6-NEXT: s_lshr_b32 s26, s5, 19
 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v8, s28
 ; GFX6-NEXT: v_mov_b32_e32 v9, s29
-; GFX6-NEXT: s_lshr_b32 s28, s4, 16
+; GFX6-NEXT: s_lshr_b32 s28, s5, 17
 ; GFX6-NEXT: v_mov_b32_e32 v10, s22
 ; GFX6-NEXT: v_mov_b32_e32 v11, s23
-; GFX6-NEXT: s_lshr_b32 s22, s4, 17
+; GFX6-NEXT: s_lshr_b32 s22, s5, 16
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v12, s24
 ; GFX6-NEXT: v_mov_b32_e32 v13, s25
-; GFX6-NEXT: s_lshr_b32 s24, s4, 14
-; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: s_lshr_b32 s24, s5, 14
 ; GFX6-NEXT: v_mov_b32_e32 v14, s18
 ; GFX6-NEXT: v_mov_b32_e32 v15, s19
-; GFX6-NEXT: s_lshr_b32 s18, s4, 15
-; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: v_mov_b32_e32 v3, s21
-; GFX6-NEXT: s_lshr_b32 s20, s4, 12
-; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: s_lshr_b32 s18, s5, 15
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v16, s20
+; GFX6-NEXT: v_mov_b32_e32 v17, s21
+; GFX6-NEXT: s_lshr_b32 s20, s5, 12
 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304
+; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v18, s14
+; GFX6-NEXT: v_mov_b32_e32 v19, s15
+; GFX6-NEXT: s_lshr_b32 s14, s5, 13
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v16, s14
-; GFX6-NEXT: v_mov_b32_e32 v17, s15
-; GFX6-NEXT: s_lshr_b32 s14, s4, 13
-; GFX6-NEXT: v_mov_b32_e32 v18, s16
-; GFX6-NEXT: v_mov_b32_e32 v19, s17
-; GFX6-NEXT: s_lshr_b32 s16, s4, 10
+; GFX6-NEXT: v_mov_b32_e32 v2, s16
+; GFX6-NEXT: v_mov_b32_e32 v3, s17
+; GFX6-NEXT: s_lshr_b32 s16, s5, 10
 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s10
-; GFX6-NEXT: v_mov_b32_e32 v9, s11
-; GFX6-NEXT: s_lshr_b32 s10, s4, 11
-; GFX6-NEXT: v_mov_b32_e32 v10, s12
-; GFX6-NEXT: v_mov_b32_e32 v11, s13
-; GFX6-NEXT: s_lshr_b32 s12, s4, 8
+; GFX6-NEXT: v_mov_b32_e32 v8, s12
+; GFX6-NEXT: v_mov_b32_e32 v9, s13
+; GFX6-NEXT: s_lshr_b32 s12, s5, 11
+; GFX6-NEXT: v_mov_b32_e32 v10, s10
+; GFX6-NEXT: v_mov_b32_e32 v11, s11
+; GFX6-NEXT: s_lshr_b32 s10, s5, 8
 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v12, s38
-; GFX6-NEXT: v_mov_b32_e32 v13, s39
-; GFX6-NEXT: s_lshr_b32 s38, s4, 9
+; GFX6-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NEXT: s_lshr_b32 s6, s5, 9
 ; GFX6-NEXT: v_mov_b32_e32 v14, s8
 ; GFX6-NEXT: v_mov_b32_e32 v15, s9
-; GFX6-NEXT: s_lshr_b32 s8, s4, 6
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_lshr_b32 s8, s5, 6
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s40
-; GFX6-NEXT: v_mov_b32_e32 v1, s41
-; GFX6-NEXT: s_lshr_b32 s40, s4, 7
-; GFX6-NEXT: v_mov_b32_e32 v2, s44
-; GFX6-NEXT: v_mov_b32_e32 v3, s45
-; GFX6-NEXT: s_lshr_b32 s44, s4, 4
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v16, s34
+; GFX6-NEXT: v_mov_b32_e32 v17, s35
+; GFX6-NEXT: s_lshr_b32 s34, s5, 7
+; GFX6-NEXT: v_mov_b32_e32 v18, s40
+; GFX6-NEXT: v_mov_b32_e32 v19, s41
+; GFX6-NEXT: s_lshr_b32 s40, s5, 4
 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v16, s36
-; GFX6-NEXT: v_mov_b32_e32 v17, s37
-; GFX6-NEXT: s_lshr_b32 s36, s4, 5
-; GFX6-NEXT: v_mov_b32_e32 v18, s42
-; GFX6-NEXT: v_mov_b32_e32 v19, s43
-; GFX6-NEXT: s_lshr_b32 s42, s4, 2
-; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
+; GFX6-NEXT: v_mov_b32_e32 v0, s42
+; GFX6-NEXT: v_mov_b32_e32 v1, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 5
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: v_mov_b32_e32 v3, s37
+; GFX6-NEXT: s_lshr_b32 s36, s5, 2
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s30
-; GFX6-NEXT: v_mov_b32_e32 v9, s31
-; GFX6-NEXT: s_lshr_b32 s30, s4, 3
-; GFX6-NEXT: s_lshr_b32 s4, s4, 1
-; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v8, s38
+; GFX6-NEXT: v_mov_b32_e32 v9, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 3
+; GFX6-NEXT: s_lshr_b32 s44, s5, 1
 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
@@ -8589,71 +8590,71 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GFX6-NEXT: s_waitcnt expcnt(2)
-; GFX6-NEXT: v_mov_b32_e32 v0, s34
-; GFX6-NEXT: v_mov_b32_e32 v1, s35
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432
+; GFX6-NEXT: v_mov_b32_e32 v10, s4
+; GFX6-NEXT: v_mov_b32_e32 v11, s5
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v0, s30
+; GFX6-NEXT: v_mov_b32_e32 v1, s31
 ; GFX6-NEXT: v_mov_b32_e32 v2, s26
 ; GFX6-NEXT: v_mov_b32_e32 v3, s27
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s28
-; GFX6-NEXT: v_mov_b32_e32 v1, s29
-; GFX6-NEXT: v_mov_b32_e32 v2, s22
-; GFX6-NEXT: v_mov_b32_e32 v3, s23
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GFX6-NEXT: v_mov_b32_e32 v0, s22
+; GFX6-NEXT: v_mov_b32_e32 v1, s23
+; GFX6-NEXT: v_mov_b32_e32 v2, s28
+; GFX6-NEXT: v_mov_b32_e32 v3, s29
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s24
 ; GFX6-NEXT: v_mov_b32_e32 v1, s25
 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
 ; GFX6-NEXT: v_mov_b32_e32 v3, s19
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s20
 ; GFX6-NEXT: v_mov_b32_e32 v1, s21
 ; GFX6-NEXT: v_mov_b32_e32 v2, s14
 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s16
 ; GFX6-NEXT: v_mov_b32_e32 v1, s17
-; GFX6-NEXT: v_mov_b32_e32 v2, s10
-; GFX6-NEXT: v_mov_b32_e32 v3, s11
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v3, s13
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NEXT: v_mov_b32_e32 v3, s39
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v0, s10
+; GFX6-NEXT: v_mov_b32_e32 v1, s11
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320
 ; GFX6-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-NEXT: v_mov_b32_e32 v2, s40
-; GFX6-NEXT: v_mov_b32_e32 v3, s41
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NEXT: v_mov_b32_e32 v2, s34
+; GFX6-NEXT: v_mov_b32_e32 v3, s35
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s44
-; GFX6-NEXT: v_mov_b32_e32 v1, s45
-; GFX6-NEXT: v_mov_b32_e32 v2, s36
-; GFX6-NEXT: v_mov_b32_e32 v3, s37
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NEXT: v_mov_b32_e32 v0, s40
+; GFX6-NEXT: v_mov_b32_e32 v1, s41
+; GFX6-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NEXT: v_mov_b32_e32 v3, s43
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288
 ; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s42
-; GFX6-NEXT: v_mov_b32_e32 v1, s43
-; GFX6-NEXT: v_mov_b32_e32 v2, s30
-; GFX6-NEXT: v_mov_b32_e32 v3, s31
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NEXT: v_mov_b32_e32 v8, s4
-; GFX6-NEXT: v_mov_b32_e32 v9, s5
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s36
+; GFX6-NEXT: v_mov_b32_e32 v1, s37
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v3, s39
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
+; GFX6-NEXT: v_mov_b32_e32 v8, s44
+; GFX6-NEXT: v_mov_b32_e32 v9, s45
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256
 ; GFX6-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 8c8dd83c7a4bf..a135b43bad0fe 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1649,15 +1649,15 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s4, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s4, 16
 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
@@ -1672,14 +1672,14 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-HSA-NEXT: s_ashr_i32 s1, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s0, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s1, s2, 16
 ; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3
 ; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
@@ -6545,33 +6545,33 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s5
 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31
 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 16
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
@@ -6592,8 +6592,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: s_mov_b32 s2, s7
-; GCN-HSA-NEXT: s_mov_b32 s8, s5
-; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 16
+; GCN-HSA-NEXT: s_mov_b32 s10, s5
 ; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
 ; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16
 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000
@@ -6611,25 +6611,25 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -7167,12 +7167,12 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s3
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s1
 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000
@@ -7180,60 +7180,60 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31
 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s3, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s7, 16
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s24
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13
@@ -7249,19 +7249,19 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_mov_b32 s12, s7
+; GCN-HSA-NEXT: s_mov_b32 s10, s7
+; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16
 ; GCN-HSA-NEXT: s_mov_b32 s14, s5
-; GCN-HSA-NEXT: s_mov_b32 s16, s3
-; GCN-HSA-NEXT: s_mov_b32 s18, s1
-; GCN-HSA-NEXT: s_ashr_i32 s27, s1, 31
+; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16
+; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31
 ; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31
 ; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s22, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s26, s0, 16
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_mov_b32 s18, s3
+; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16
+; GCN-HSA-NEXT: s_mov_b32 s22, s1
+; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x100000
 ; GCN-HSA-NEXT: s_ashr_i32 s28, s1, 16
 ; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31
 ; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 16
@@ -7272,55 +7272,36 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[24:25], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000
 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-HSA-NEXT: s_add_u32 s24, s8, 0x70
-; GCN-HSA-NEXT: s_addc_u32 s25, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: s_add_u32 s14, s8, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_add_u32 s22, s8, 0x70
+; GCN-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6
 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35
 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34
-; GCN-HSA-NEXT: s_add_u32 s14, s8, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s33
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31
+; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: s_add_u32 s14, s8, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
 ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_nop 0
@@ -7329,17 +7310,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s8, 48
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT: s_add_u32 s4, s8, 32
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s8, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
@@ -8313,148 +8312,151 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s11
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s1, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s3, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s3, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s5, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s9, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s9, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s11, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s3, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s13, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s15, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s3
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s12, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s2, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s0, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s49
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s47
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[56:57], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s61
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s60
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s59
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s58
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s57
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s55
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s37
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s35
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32
v8, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8466,47 +8468,47 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s34, s15 -; GCN-HSA-NEXT: s_ashr_i32 s41, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s61, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s65, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s67, s11, 31 -; GCN-HSA-NEXT: s_ashr_i32 s69, s11, 16 -; GCN-HSA-NEXT: s_mov_b32 s44, s13 -; GCN-HSA-NEXT: s_mov_b32 s46, s11 -; GCN-HSA-NEXT: s_mov_b32 s48, s9 -; GCN-HSA-NEXT: s_mov_b32 s50, s7 -; GCN-HSA-NEXT: s_mov_b32 s52, s5 -; GCN-HSA-NEXT: s_mov_b32 s38, s3 -; GCN-HSA-NEXT: s_mov_b32 s36, s1 -; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s24, s15 +; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 +; GCN-HSA-NEXT: s_mov_b32 s48, s13 +; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 +; GCN-HSA-NEXT: s_mov_b32 s52, s11 +; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-HSA-NEXT: s_mov_b32 s30, s9 +; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-HSA-NEXT: s_mov_b32 s54, s7 +; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s58, s5 +; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s62, s3 +; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s66, s1 ; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[34:35], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GCN-HSA-NEXT: s_ashr_i32 s40, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s43, s5, 31 -; GCN-HSA-NEXT: s_ashr_i32 s70, s13, 31 -; GCN-HSA-NEXT: s_ashr_i32 s71, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s72, s15, 31 -; GCN-HSA-NEXT: s_ashr_i32 s73, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31 ; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 @@ -8516,149 +8518,149 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: 
s_bfe_i64 s[10:11], s[60:61], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s72 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s71 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s69 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s67 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s37 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 -; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s63 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s44 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 +; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 +; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 +; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s36 -; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x80 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 5c4bc95578bb4..b534c2c267fad 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6398,41 +6398,41 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8 -; 
GFX6-NOHSA-NEXT: s_mov_b32 s10, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s16, s5 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s5, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: @@ -6445,11 +6445,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s8, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s14, s3 ; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 @@ -6465,32 
+6465,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -6502,11 +6502,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 ; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 @@ -6522,32 +6522,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6615,34 +6615,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 -; GFX12-NEXT: s_lshr_b32 s6, s3, 8 -; GFX12-NEXT: s_mov_b32 s8, s3 -; GFX12-NEXT: s_lshr_b32 s10, s2, 16 -; GFX12-NEXT: s_lshr_b32 s12, s2, 24 +; GFX12-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-NEXT: s_lshr_b32 s8, s2, 24 +; GFX12-NEXT: s_lshr_b32 s10, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: s_lshr_b32 s12, s3, 8 +; GFX12-NEXT: s_mov_b32 s14, s3 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i32 s15, s3, 31 ; GFX12-NEXT: s_ashr_i32 s18, s3, 24 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_lshr_b32 s14, s2, 8 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 -; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 -; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_mov_b32_e32 v14, s12 ; 
GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = sext <8 x i8> %load to <8 x i64> @@ -7033,80 +7033,81 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 
v13, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 
0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -7118,31 +7119,33 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s12, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX7-HSA-NEXT: s_mov_b32 s24, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24 +; GFX7-HSA-NEXT: s_mov_b32 s22, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s28, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 @@ -7150,73 +7153,70 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GFX7-HSA-NEXT: s_add_u32 s24, s0, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s25, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: 
s_add_u32 s6, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 
s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -7225,107 +7225,109 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s5, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NOHSA-NEXT: s_ashr_i32 s19, s5, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s10, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s8, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s11, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s11, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s28, s11 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s9, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9 +; GFX8-NOHSA-NEXT: s_ashr_i32 s25, s9, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: 
s_ashr_i32 s30, s7, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v5, s9
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_nop 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
@@ -7435,64 +7437,64 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_lshr_b32 s10, s7, 8
-; GFX12-NEXT: s_mov_b32 s12, s7
-; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i32 s33, s7, 31
-; GFX12-NEXT: s_ashr_i32 s36, s7, 24
+; GFX12-NEXT: s_lshr_b32 s2, s6, 16
+; GFX12-NEXT: s_lshr_b32 s8, s6, 24
+; GFX12-NEXT: s_lshr_b32 s10, s6, 8
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX12-NEXT: s_lshr_b32 s14, s6, 16
-; GFX12-NEXT: s_lshr_b32 s16, s6, 24
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: s_lshr_b32 s12, s4, 16
+; GFX12-NEXT: s_lshr_b32 s14, s4, 24
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s33
-; GFX12-NEXT: s_lshr_b32 s18, s6, 8
-; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s35
-; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: s_lshr_b32 s20, s5, 16
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
+; GFX12-NEXT: s_lshr_b32 s16, s4, 8
+; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: s_lshr_b32 s18, s7, 16
 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s15
-; GFX12-NEXT: s_lshr_b32 s22, s5, 8
-; GFX12-NEXT: s_mov_b32 s24, s5
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s13
+; GFX12-NEXT: s_lshr_b32 s20, s7, 8
+; GFX12-NEXT: s_mov_b32 s22, s7
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT: s_lshr_b32 s24, s5, 16
+; GFX12-NEXT: s_ashr_i32 s33, s7, 31
+; GFX12-NEXT: s_ashr_i32 s36, s7, 24
 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT: s_lshr_b32 s26, s4, 16
-; GFX12-NEXT: s_lshr_b32 s28, s4, 24
-; GFX12-NEXT: s_ashr_i32 s29, s5, 31
-; GFX12-NEXT: s_ashr_i32 s31, s5, 24
+; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15
+; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17
+; GFX12-NEXT: s_lshr_b32 s26, s5, 8
+; GFX12-NEXT: s_mov_b32 s28, s5
+; GFX12-NEXT: s_ashr_i32 s27, s5, 31
+; GFX12-NEXT: s_ashr_i32 s29, s5, 24
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s17
-; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v7, s19
-; GFX12-NEXT: s_lshr_b32 s30, s4, 8
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_mov_b32_e32 v6, s18
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
 ; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v2, s36
+; GFX12-NEXT: v_mov_b32_e32 v9, s23
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s21
+; GFX12-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v17, s25
+; GFX12-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v19, s27
+; GFX12-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v21, s5
+; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s7
+; GFX12-NEXT: v_mov_b32_e32 v22, s6
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1]
 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s29
-; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v2, s31
-; GFX12-NEXT: v_mov_b32_e32 v9, s25
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v11, s23
-; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v17, s27
-; GFX12-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v19, s7
-; GFX12-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s3
-; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s5
-; GFX12-NEXT: v_mov_b32_e32 v22, s4
-; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:32
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = sext <16 x i8> %load to <16 x i64>
@@ -8204,157 +8206,157 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s7, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s30, s7
 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s34, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24
+; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7
 ; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31
 ; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24
-; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s7, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s44, s3
-; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s52, s1
-; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s0, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s0, 8
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[0:1], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24
+; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5
+; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3
+; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1
 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8
 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s58
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s59
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s56
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s57
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s62
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s63
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s27
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s19
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s6
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s25
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[54:55], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[50:51], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s44
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s45
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s42
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s43
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s52
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s30
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s31
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
@@ -8366,211 +8368,212 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8
-; GFX7-HSA-NEXT: s_mov_b32 s42, s7
-; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16
-; GFX7-HSA-NEXT: s_ashr_i32 s41, s1, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s45, s3, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s50, s6, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s46, s5, 8
-; GFX7-HSA-NEXT: s_mov_b32 s54, s5
-; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s36, s4, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s34, s4, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s28, s3, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s24, s3, 8
-; GFX7-HSA-NEXT: s_mov_b32 s26, s3
-; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s20, s2, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s14, s1, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s56, s1, 8
-; GFX7-HSA-NEXT: s_mov_b32 s12, s1
-; GFX7-HSA-NEXT: s_lshr_b32 s58, s0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s60, s0, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s62, s0, 8
-; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s66, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s67, s5, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s68, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s69, s7, 24
-; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[64:65], s[6:7], 0x80000
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8
+; GFX7-HSA-NEXT: s_mov_b32 s68, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16
+; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31
+; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8
+; GFX7-HSA-NEXT: s_mov_b32 s24, s7
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8
+; GFX7-HSA-NEXT: s_mov_b32 s16, s5
+; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8
+; GFX7-HSA-NEXT: s_mov_b32 s62, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24
+; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[44:45], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[40:41], 0x80000
-; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0
+; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000
+; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0
 ; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s58
-; GFX7-HSA-NEXT: s_add_u32 s58, s8, 0xe0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s59
-; GFX7-HSA-NEXT: s_addc_u32 s59, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s49
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xc0
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s62
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s49
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xb0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s63
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s69
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s68
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s58
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38
-; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0
+; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s59
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39
+; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38
+; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39
 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
+; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35
+; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s56
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s30
-; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s31
-; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s31
-; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x80
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s57
-; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28
-; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s65
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29
-; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s52
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s53
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s49
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s67
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s66
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s38
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s24
-; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s46
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s39
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s28
-; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s35
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s45
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s25
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
-; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX7-HSA-NEXT: s_add_u32 s16, s8, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-HSA-NEXT: s_addc_u32 s17, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: s_add_u32 s14, s8, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
 ; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT: s_add_u32 s12, s8, 0x70
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s8, 0x60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: s_add_u32 s4, s8, 48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
 ;
@@ -8580,140 +8583,175 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s7, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s54, s7
-; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s6, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s5, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s5, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s46, s5
-; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s4, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s3, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s30, s3
-; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s2, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s1, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s1, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s18, s1
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s0, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s0, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s0, 8
-; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s3, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s6, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s6, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s4, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s4, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s4, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s2, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s7, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s7, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7
+; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s5, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s5, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s18, s5
+; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s3, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s3, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3
+; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s1, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s1, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s10, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s5, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[0:1], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s42, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[44:45], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s3, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s44, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s5, 31
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s7, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s7, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s3, 31
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[64:65], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s66, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 24
 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50
-; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xf0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51
-; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s66
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51
-; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xe0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s54
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s55
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51
-; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xd0
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47
+; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s69
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s57
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51
+; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_nop 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44
-; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xc0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45
-; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s38
+; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x80
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39
+; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39
+; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xb0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xa0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
+; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40
-; GFX8-NOHSA-NEXT: s_add_u32 s40, s8, 0x90
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s42
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31
+; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36
-; GFX8-NOHSA-NEXT: s_add_u32 s36, s8, 0x80
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37
-; GFX8-NOHSA-NEXT: s_addc_u32 s37, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s37
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xf0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xb0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_nop 0
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
@@ -8723,33 +8761,15 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43
 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_nop 0
@@ -8760,32 +8780,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
 ;
@@ -8984,122 +8988,120 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s40, s7, 16
-; GFX12-NEXT: s_lshr_b32 s50, s6, 8
-; GFX12-NEXT: s_lshr_b32 s62, s3, 16
-; GFX12-NEXT: s_ashr_i32 s51, s3, 24
-; GFX12-NEXT: s_lshr_b32 s42, s7, 8
-; GFX12-NEXT: s_mov_b32 s44, s7
-; GFX12-NEXT: s_lshr_b32 s46, s6, 16
-; GFX12-NEXT: s_lshr_b32 s48, s6, 24
-; GFX12-NEXT: s_lshr_b32 s38, s5, 16
-; GFX12-NEXT: s_lshr_b32 s52, s5, 8
-; GFX12-NEXT: s_mov_b32 s54, s5
-; GFX12-NEXT: s_lshr_b32 s56, s4, 16
-; GFX12-NEXT: s_lshr_b32 s58, s4, 24
-; GFX12-NEXT: s_lshr_b32 s60, s4, 8
-; GFX12-NEXT: s_lshr_b32 s36, s3, 8
-; GFX12-NEXT: s_mov_b32 s34, s3
-; GFX12-NEXT: s_lshr_b32 s28, s2, 16
-; GFX12-NEXT: s_lshr_b32 s26, s2, 24
-; GFX12-NEXT: s_lshr_b32 s24, s2, 8
-; GFX12-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
+; GFX12-NEXT: s_lshr_b32 s34, s6, 16
+; GFX12-NEXT: s_lshr_b32 s36, s6, 24
+; GFX12-NEXT: s_lshr_b32 s38, s6, 8
+; GFX12-NEXT: s_lshr_b32 s40, s4, 16
+; GFX12-NEXT: s_lshr_b32 s42, s4, 24
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GFX12-NEXT: s_lshr_b32 s44, s4, 8
 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i32 s39, s3, 31
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
-; GFX12-NEXT: s_ashr_i32 s62, s5, 31
-; GFX12-NEXT: s_ashr_i32 s63, s5, 24
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
-; GFX12-NEXT: s_ashr_i32 s50, s7, 31
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX12-NEXT: s_ashr_i32 s7, s7, 24
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37
+; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s67
+; GFX12-NEXT: s_lshr_b32 s28, s2, 16
+; GFX12-NEXT: s_lshr_b32 s46, s2, 24
+; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000
 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s41
-; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s39
+; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s41
+; GFX12-NEXT: s_lshr_b32 s48, s2, 8
+; GFX12-NEXT: v_dual_mov_b32 v8, s40 :: v_dual_mov_b32 v11, s43
+; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s65
+; GFX12-NEXT: s_lshr_b32 s50, s0, 16
+; GFX12-NEXT: s_lshr_b32 s52, s0, 24
 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v3, s50
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s45
-; GFX12-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v7, s43
-; GFX12-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v9, s47
-; GFX12-NEXT: v_dual_mov_b32 v8, s46 :: v_dual_mov_b32 v11, s49
-; GFX12-NEXT: v_dual_mov_b32 v10, s48 :: v_dual_mov_b32 v13, s67
-; GFX12-NEXT: v_dual_mov_b32 v12, s66 :: v_dual_mov_b32 v15, s5
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[38:39], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s45
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX12-NEXT: v_mov_b32_e32 v14, s44
+; GFX12-NEXT: s_lshr_b32 s54, s0, 8
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX12-NEXT: s_lshr_b32 s56, s7, 16
+; GFX12-NEXT: s_lshr_b32 s58, s5, 16
+; GFX12-NEXT: s_lshr_b32 s60, s1, 8
+; GFX12-NEXT: s_mov_b32 s62, s1
+; GFX12-NEXT: s_ashr_i32 s57, s1, 24
+; GFX12-NEXT: s_ashr_i32 s59, s3, 31
+; GFX12-NEXT: s_ashr_i32 s61, s3, 24
+; GFX12-NEXT: s_ashr_i32 s63, s5, 31
 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
 ; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s62
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s63
-; GFX12-NEXT: v_mov_b32_e32 v5, s55
-; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s54 :: v_dual_mov_b32 v7, s53
-; GFX12-NEXT: v_dual_mov_b32 v6, s52 :: v_dual_mov_b32 v9, s57
-; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59
-; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31
-; GFX12-NEXT: s_lshr_b32 s22, s1, 16
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61
-; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s3
-; GFX12-NEXT: s_lshr_b32 s16, s1, 8
-; GFX12-NEXT: s_mov_b32 s18, s1
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s39
-; GFX12-NEXT: v_dual_mov_b32 v18, s51 :: v_dual_mov_b32 v21, s35
-; GFX12-NEXT: s_lshr_b32 s14, s0, 16
-; GFX12-NEXT: s_lshr_b32 s12, s0, 24
-; GFX12-NEXT: s_ashr_i32 s6, s1, 31
-; GFX12-NEXT: s_ashr_i32 s33, s1, 24
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37
-; GFX12-NEXT: v_mov_b32_e32 v22, s36
-; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:208
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:192
 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144
 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s27
-; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s26
-; GFX12-NEXT: v_mov_b32_e32 v5, s21
-; GFX12-NEXT: s_lshr_b32 s64, s0, 8
+; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s47
+; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s46
+; GFX12-NEXT: v_mov_b32_e32 v5, s31
+; GFX12-NEXT: s_lshr_b32 s26, s7, 8
+; GFX12-NEXT: s_mov_b32 s24, s7
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s49
+; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51
+; GFX12-NEXT: s_lshr_b32 s18, s5, 8
+; GFX12-NEXT: s_mov_b32 s20, s5
+; GFX12-NEXT: s_lshr_b32 s16, s3, 16
+; GFX12-NEXT: s_lshr_b32 s12, s3, 8
+; GFX12-NEXT: s_mov_b32 s14, s3
+; GFX12-NEXT: s_lshr_b32 s10, s1, 16
+; GFX12-NEXT: s_ashr_i32 s33, s1, 31
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x80000
+; GFX12-NEXT: s_ashr_i32 s60, s5, 24
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x80000
+; GFX12-NEXT: s_ashr_i32 s58, s7, 31
+; GFX12-NEXT: s_ashr_i32 s62, s7, 24
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s50 :: v_dual_mov_b32 v11, s53
+; GFX12-NEXT: v_dual_mov_b32 v10, s52 :: v_dual_mov_b32 v13, s23
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s55
+; GFX12-NEXT: v_dual_mov_b32 v14, s54 :: v_dual_mov_b32 v17, s7
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s58
+; GFX12-NEXT: v_dual_mov_b32 v18, s62 :: v_dual_mov_b32 v21, s25
 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s25
-; GFX12-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v9, s23
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s6
-; GFX12-NEXT: v_dual_mov_b32 v10, s33 :: v_dual_mov_b32 v13, s19
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s17
-; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v17, s15
-; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s13
-; GFX12-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v21, s11
-; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1
-; GFX12-NEXT: v_mov_b32_e32 v22, s0
+; GFX12-NEXT: v_dual_mov_b32 v20, s24 :: v_dual_mov_b32 v23, s27
+; GFX12-NEXT: v_mov_b32_e32 v22, s26
 ; GFX12-NEXT: s_clause 0x5
 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:80
 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:48
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:32
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:16
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9]
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9]
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:240
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:224
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s63
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s60
+; GFX12-NEXT: v_mov_b32_e32 v5, s21
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19
+; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s59
+; GFX12-NEXT: v_dual_mov_b32 v10, s61 :: v_dual_mov_b32 v13, s15
+; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11
+; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s33
+; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v21, s3
+; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s1
+; GFX12-NEXT: v_mov_b32_e32 v22, s0
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:96
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:32
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = sext <32 x i8> %load to <32 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index ff5b9aadc87fb..09d3c3b01b809 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1749,8 +1749,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 16
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -1769,8 +1769,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
 ; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
 ; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
@@ -6376,8 +6376,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
@@ -6401,28 +6401,28 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16
 ; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
@@ -6431,9 +6431,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
@@ -6975,58 +6975,59 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v4, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v18, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v5
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v6, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v1
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
@@ -7048,31 +7049,31 @@ define amdgpu_kernel
void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 @@ -7080,36 +7081,36 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_bfe_i32 v6, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v16, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 ; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v11, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 @@ -7118,9 +7119,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -8111,113 +8112,115 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v12 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v26, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v9 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; 
GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: @@ -8229,180 +8232,179 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt 
vmcnt(4) -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 +; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16 +; 
GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 -; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7 +; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[19:22] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v0, 
v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] -; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v25, v9, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[23:26] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[13:16] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-HSA-NEXT: v_bfe_i32 v17, v22, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v5, v12, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: s_endpgm ; ; 
GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 6a39df95f6aba..f879dc660203f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -6274,12 +6274,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 @@ -6294,19 +6294,19 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i64: @@ -6325,11 +6325,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0 ; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 8 -; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 24 -; GCN-HSA-NEXT: s_lshr_b32 s12, s3, 8 +; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8 ; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000 @@ -6337,38 +6338,37 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], 
s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i64: @@ -6388,10 +6388,10 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 @@ -6408,18 +6408,18 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s11 
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v8i8_to_v8i64: @@ -6934,84 +6934,85 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 8 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s9, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s9, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64: @@ -7024,41 +7025,41 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 8 -; GCN-HSA-NEXT: s_mov_b32 s10, s3 -; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s2, 24 -; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8 +; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8 +; GCN-HSA-NEXT: s_mov_b32 s22, s3 ; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s18, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 8 -; GCN-HSA-NEXT: s_mov_b32 s22, s5 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_ashr_i32 s7, s5, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: s_lshr_b32 s24, s4, 24 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: 
s_lshr_b32 s4, s5, 8 +; GCN-HSA-NEXT: s_mov_b32 s24, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -7069,66 +7070,66 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 
v4, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64: @@ -7142,83 +7143,84 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s5, 16 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s5, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s4, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s4, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s5, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s9, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s8, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s8, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s12, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s9, 31 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s9, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i8_to_v16i64: @@ -8174,166 +8176,166 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s39, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s38, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s37, v1 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s36, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s39, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s39, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s39 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s38, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s38, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s38, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s37, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s37, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s36, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s36, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s36, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s7, 8 -; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s39, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s39, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s6, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s37, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s37, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s41 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s6, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s11, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s11, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s11 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 24 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s10, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s7, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s7, 24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s11, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[44:45], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000 ; GCN-NOHSA-SI-NEXT: 
s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v7, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64: @@ -8346,225 +8348,223 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v9 -; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v8 -; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v7 -; GCN-HSA-NEXT: 
v_readfirstlane_b32 s8, v6 -; GCN-HSA-NEXT: s_lshr_b32 s16, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s7, 8 -; GCN-HSA-NEXT: s_mov_b32 s24, s7 -; GCN-HSA-NEXT: s_lshr_b32 s22, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s6, 24 -; GCN-HSA-NEXT: s_lshr_b32 s2, s6, 8 -; GCN-HSA-NEXT: s_lshr_b32 s4, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s9, 8 -; GCN-HSA-NEXT: s_mov_b32 s12, s9 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s6, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 24 -; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 8 -; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s52, s9, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5 +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7 +; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8 +; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16 +; GCN-HSA-NEXT: s_mov_b32 s28, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s25, v5 -; GCN-HSA-NEXT: v_readfirstlane_b32 s24, v4 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-HSA-NEXT: v_readfirstlane_b32 s37, v3 -; GCN-HSA-NEXT: v_readfirstlane_b32 s36, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_mov_b32 s22, s7 +; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16 +; GCN-HSA-NEXT: 
s_lshr_b32 s48, s44, 24 +; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8 +; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8 +; GCN-HSA-NEXT: s_mov_b32 s4, s45 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16 +; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24 +; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8 +; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8 +; GCN-HSA-NEXT: s_mov_b32 s14, s41 +; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24 +; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_lshr_b32 s14, s37, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s37, 8 -; GCN-HSA-NEXT: s_mov_b32 s12, s37 -; GCN-HSA-NEXT: s_lshr_b32 s8, s36, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s36, 24 -; GCN-HSA-NEXT: s_lshr_b32 s4, s36, 8 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[36:37], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s38, s25, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s25, 8 -; GCN-HSA-NEXT: s_mov_b32 s36, s25 -; GCN-HSA-NEXT: s_lshr_b32 s48, s24, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s24, 24 -; GCN-HSA-NEXT: s_lshr_b32 s18, s24, 8 -; GCN-HSA-NEXT: s_ashr_i32 s50, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s51, s7, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s15 -; GCN-HSA-NEXT: s_ashr_i32 s33, s37, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s37, 24 -; GCN-HSA-NEXT: s_ashr_i32 s53, s25, 31 -; GCN-HSA-NEXT: s_ashr_i32 s54, s25, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31 +; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24 +; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31 +; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[48:49], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 64 -; 
GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s44 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 +; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 +; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 +; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 +; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 +; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s21 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xf0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s21 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s20 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, 
s29 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24 +; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 +; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: 
s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -8584,155 +8584,155 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s9, 16 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s9, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s9, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s9, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s11, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v6 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v4 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s11, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s11 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s11, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s11, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s39 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s10, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v2, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s13 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s13, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s12, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s12, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s15, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s15, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s15 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s15, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s15, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s14, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s8, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[64:65], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[68:69], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s42 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s45 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i8_to_v32i64: @@ -10530,33 +10530,34 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s5, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s5 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s4 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s9, s5, 0x80000 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s4 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s4, 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s7, s7, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s6, s6, 0x80000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, 0xffff, s9 -; 
GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, 0xffff, s7 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, 0xffff, s6 -; GCN-NOHSA-VI-NEXT: v_and_b32_sdwa v0, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s10, s11 ; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s7, s5 ; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s6, s4 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, s10, v0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 @@ -11344,27 +11345,27 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s7, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s4 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s6 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s6, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s5, 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s4, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s5, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s5, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s7 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s6 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s4 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s4, 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s7, 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s6, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s11, s11, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s10, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 8 @@ -11373,12 +11374,12 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s8, s8, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 
0xffff, s17 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, 0xffff, s11 ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10 ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff0000 @@ -11389,22 +11390,22 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15 ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s11, s7 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s10, s6 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s5 +; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s10, s4 ; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s17, s16 ; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s18 -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s9, s5 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s8, s4 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s9, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s8, s6 ; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s13, s12 ; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s15, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -12843,8 +12844,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) @@ -12852,38 +12853,39 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s6 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s31, s6, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s14, s14, 0x80000 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s15, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, 0xffff, s14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v5 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15 ; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s14, s6 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s11 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s10 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s9 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s8 ; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s15, s7 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s5, 0x80000 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s11, 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s10, 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s9, 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s8, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s20, 8 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s22, s22, 8 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v7 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s21, 0xffff, s21 @@ -12893,11 +12895,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s15, s14 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s4 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s9 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s9, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s11, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: s_or_b32 s20, s21, s20 ; GCN-NOHSA-VI-NEXT: s_or_b32 s21, s23, s22 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8 @@ -12906,13 +12908,12 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s13, 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s12, s12, 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s8 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s8, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s9, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s10 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s10, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s17, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s16, s16, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s28, s28, 8 @@ -12923,16 +12924,16 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, 0xffff, s13 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 
s12, 0xffff, s12 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 24 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s19, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s18, s18, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s24, s24, 8 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s26, s26, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, 0xffff, s29 ; GCN-NOHSA-VI-NEXT: s_and_b32 s31, 0xffff, s31 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17 ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, 0xffff, s16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff0000 @@ -12940,45 +12941,45 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s22, s15 ; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s13, s5 ; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s12, s4 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s25, 0xffff, s25 ; GCN-NOHSA-VI-NEXT: s_and_b32 s27, 0xffff, s27 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19 ; GCN-NOHSA-VI-NEXT: s_and_b32 s18, 0xffff, s18 ; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s17, s9 -; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s16, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s17, s11 +; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s16, s10 ; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s29, s28 ; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s31, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s11 -; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s18, s10 +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s19, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s18, s8 ; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s25, s24 ; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s27, s26 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i8_to_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index a209de78cd299..ddd1ce66c013a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1083,12 +1083,12 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 -; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v0 -; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v0, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32: @@ -6153,11 +6153,11 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 @@ -6819,10 +6819,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 @@ -6845,24 +6845,24 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v7, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 ; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13 
+; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 ; SI-NEXT: s_endpgm @@ -8114,16 +8114,16 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 ; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 ; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8fe68ba748971..5087bdb9d8f7b 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -533,8 +533,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 @@ -1912,8 +1913,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 676359fcec462..5c0f813c8c829 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -391,144 +391,156 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; 
GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; GCN-NEXT: v_xor_b32_e32 v4, v0, v2 -; GCN-NEXT: v_xor_b32_e32 v7, v1, v3 -; GCN-NEXT: v_max_i32_e32 v2, v2, v6 -; GCN-NEXT: v_max_i32_e32 v3, v3, v9 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GCN-NEXT: v_max_i32_e32 v0, v0, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GCN-NEXT: v_mul_hi_u32 v9, v6, v9 -; GCN-NEXT: v_max_i32_e32 v1, v1, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v5, v10 -; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 -; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v7 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: s_abs_i32 s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: s_sub_i32 s6, 0, s1 +; GCN-NEXT: v_readfirstlane_b32 s8, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_lo_u32 v4, s6, v2 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_abs_i32 s7, s6 +; GCN-NEXT: s_xor_b32 s0, s6, s0 +; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 +; GCN-NEXT: s_ashr_i32 s6, s0, 31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s7, s0 +; GCN-NEXT: s_sub_i32 s7, s0, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: s_cselect_b32 s0, s7, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_abs_i32 s7, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_sub_i32 s4, 0, s7 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, 
vcc +; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: s_xor_b32 s5, s4, s8 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: v_mul_hi_u32 v1, v3, v4 +; GCN-NEXT: s_ashr_i32 s5, s5, 31 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: s_mul_i32 s6, s6, s7 +; GCN-NEXT: s_sub_i32 s4, s4, s6 +; GCN-NEXT: s_sub_i32 s6, s4, s7 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GCN-NEXT: s_cmp_ge_u32 s4, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: s_cselect_b32 s4, s6, s4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GCN-NEXT: s_cmp_ge_u32 s4, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_xor_b32_e32 v1, s5, v1 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3 -; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2 -; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v6 -; TONGA-NEXT: v_max_i32_e32 v3, v3, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 -; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6 -; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3 -; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6 -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1 -; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9 -; TONGA-NEXT: v_max_i32_e32 v1, v1, v8 -; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10 -; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8 -; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6 -; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2 -; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e64 
v6, v6, v9, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 -; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7 -; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; TONGA-NEXT: v_readfirstlane_b32 s0, v2 +; TONGA-NEXT: s_abs_i32 s1, s0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1 +; TONGA-NEXT: s_sub_i32 s6, 0, s1 +; TONGA-NEXT: v_readfirstlane_b32 s8, v3 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 +; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2 +; TONGA-NEXT: v_readfirstlane_b32 s6, v0 +; TONGA-NEXT: s_abs_i32 s7, s6 +; TONGA-NEXT: s_xor_b32 s0, s6, s0 +; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 +; TONGA-NEXT: s_ashr_i32 s6, s0, 31 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4 +; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0 +; TONGA-NEXT: v_readfirstlane_b32 s0, v0 +; TONGA-NEXT: s_mul_i32 s0, s0, s1 +; TONGA-NEXT: s_sub_i32 s0, s7, s0 +; TONGA-NEXT: s_sub_i32 s7, s0, s1 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; TONGA-NEXT: s_cmp_ge_u32 s0, s1 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; TONGA-NEXT: s_cselect_b32 s0, s7, s0 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; TONGA-NEXT: s_cmp_ge_u32 s0, s1 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: s_abs_i32 s7, s8 +; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_sub_i32 s4, 0, s7 +; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0 +; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 +; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3 +; TONGA-NEXT: v_readfirstlane_b32 s4, v1 +; TONGA-NEXT: s_xor_b32 s5, s4, s8 +; TONGA-NEXT: s_abs_i32 s4, s4 +; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4 +; TONGA-NEXT: s_ashr_i32 s5, s5, 31 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1 +; TONGA-NEXT: v_readfirstlane_b32 s6, v1 +; TONGA-NEXT: s_mul_i32 s6, s6, s7 +; TONGA-NEXT: s_sub_i32 s4, s4, s6 +; TONGA-NEXT: s_sub_i32 s6, s4, s7 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1 +; TONGA-NEXT: s_cmp_ge_u32 s4, s7 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; TONGA-NEXT: s_cselect_b32 s4, s6, s4 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1 +; TONGA-NEXT: s_cmp_ge_u32 s4, s7 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1 +; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32: @@ -546,44 +558,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: s_abs_i32 s1, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 
-; GFX9-NEXT: s_xor_b32 s0, s5, s0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 -; GFX9-NEXT: s_abs_i32 s5, s5 -; GFX9-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s0, s0, s7 ; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 ; GFX9-NEXT: s_add_i32 s7, s7, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7 ; GFX9-NEXT: s_mul_i32 s7, s0, s1 -; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 ; GFX9-NEXT: s_add_i32 s10, s0, 1 -; GFX9-NEXT: s_sub_i32 s7, s5, s1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_sub_i32 s7, s4, s1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s1 ; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 ; GFX9-NEXT: s_add_i32 s7, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s5, s7, s0 -; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s4, s1 +; GFX9-NEXT: s_cselect_b32 s4, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s5, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_xor_b32 s5, s8, s5 ; GFX9-NEXT: s_abs_i32 s8, s8 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s9, s9, s6 ; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9 @@ -599,10 +611,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_add_i32 s9, s6, 1 ; GFX9-NEXT: s_cmp_ge_u32 s8, s7 ; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: s_sub_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_xor_b32 s6, s6, s5 +; GFX9-NEXT: s_sub_i32 s5, s6, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -792,255 +804,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s6, s10 +; GCN-NEXT: s_mov_b32 s7, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s9, s1 
; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 -; GCN-NEXT: v_max_i32_e32 v4, v4, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_max_i32_e32 v5, v5, v13 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_max_i32_e32 v0, v0, v9 -; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 -; GCN-NEXT: v_max_i32_e32 v1, v1, v12 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v13 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 -; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 -; GCN-NEXT: v_max_i32_e32 v6, v6, v15 -; GCN-NEXT: v_mul_hi_u32 v12, v13, v16 -; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GCN-NEXT: v_mul_lo_u32 v13, v10, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 -; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v12, v5 -; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v9 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 -; GCN-NEXT: v_max_i32_e32 v5, v7, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 -; GCN-NEXT: v_mul_hi_u32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: s_abs_i32 s13, s0 +; GCN-NEXT: s_abs_i32 s14, s1 +; GCN-NEXT: s_abs_i32 s15, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; GCN-NEXT: v_max_i32_e32 v2, v2, v9 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: s_abs_i32 s17, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 -; GCN-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GCN-NEXT: v_max_i32_e32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 -; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s3, v4 +; GCN-NEXT: v_readfirstlane_b32 s4, v5 +; GCN-NEXT: v_readfirstlane_b32 s5, v6 +; GCN-NEXT: s_xor_b32 s12, s3, s0 +; GCN-NEXT: s_xor_b32 s0, s4, s1 +; GCN-NEXT: s_xor_b32 s1, s5, s2 +; GCN-NEXT: s_sub_i32 s2, 0, s13 +; GCN-NEXT: s_ashr_i32 s18, s0, 31 +; GCN-NEXT: s_sub_i32 s0, 0, s14 +; GCN-NEXT: s_ashr_i32 s19, s1, 31 +; GCN-NEXT: s_sub_i32 s1, 0, s15 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s1, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: s_sub_i32 s20, 0, s17 +; GCN-NEXT: v_readfirstlane_b32 s7, v7 +; GCN-NEXT: s_abs_i32 s3, s3 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: s_abs_i32 s5, s5 +; GCN-NEXT: v_mul_lo_u32 v7, s20, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s13 +; GCN-NEXT: v_mul_lo_u32 v6, v1, s14 +; GCN-NEXT: v_mul_lo_u32 v8, v2, s15 +; GCN-NEXT: s_abs_i32 s16, s7 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_mul_hi_u32 v3, s16, v3 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 
1, v1 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6 +; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8 +; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] +; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1 +; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, s17 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GCN-NEXT: s_ashr_i32 s12, s12, 31 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s18, v1 +; GCN-NEXT: v_xor_b32_e32 v2, s19, v2 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; GCN-NEXT: s_xor_b32 s0, s7, s6 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_xor_b32_e32 v3, s0, v3 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_mov_b32 s11, 0xf000 +; TONGA-NEXT: s_mov_b32 s10, -1 +; TONGA-NEXT: s_mov_b32 s6, s10 +; TONGA-NEXT: s_mov_b32 s7, s11 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s4, s2 +; TONGA-NEXT: s_mov_b32 s5, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; TONGA-NEXT: s_mov_b32 s8, s0 +; TONGA-NEXT: s_mov_b32 s9, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 -; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 -; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 -; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 -; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: 
v_rcp_iflag_f32_e32 v13, v13 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 -; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 -; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 -; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13 -; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 -; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 -; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 -; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13 -; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10 -; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 -; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 -; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16 -; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12 -; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4 -; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5 -; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 -; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 -; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_readfirstlane_b32 s0, v0 +; TONGA-NEXT: v_readfirstlane_b32 s1, v1 +; TONGA-NEXT: v_readfirstlane_b32 s2, v2 +; TONGA-NEXT: s_abs_i32 s13, s0 +; TONGA-NEXT: s_abs_i32 s14, s1 +; TONGA-NEXT: s_abs_i32 s15, s2 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15 +; TONGA-NEXT: v_readfirstlane_b32 s6, v3 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 -; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: s_abs_i32 s17, s6 +; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, 
s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 -; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 -; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_readfirstlane_b32 s3, v4 +; TONGA-NEXT: v_readfirstlane_b32 s4, v5 +; TONGA-NEXT: v_readfirstlane_b32 s5, v6 +; TONGA-NEXT: s_xor_b32 s12, s3, s0 +; TONGA-NEXT: s_xor_b32 s0, s4, s1 +; TONGA-NEXT: s_xor_b32 s1, s5, s2 +; TONGA-NEXT: s_sub_i32 s2, 0, s13 +; TONGA-NEXT: s_ashr_i32 s18, s0, 31 +; TONGA-NEXT: s_sub_i32 s0, 0, s14 +; TONGA-NEXT: s_ashr_i32 s19, s1, 31 +; TONGA-NEXT: s_sub_i32 s1, 0, s15 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0 +; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1 +; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2 +; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 +; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 +; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 +; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6 +; TONGA-NEXT: s_sub_i32 s20, 0, s17 +; TONGA-NEXT: v_readfirstlane_b32 s7, v7 +; TONGA-NEXT: s_abs_i32 s3, s3 +; TONGA-NEXT: s_abs_i32 s4, s4 +; TONGA-NEXT: s_abs_i32 s5, s5 +; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0 +; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1 +; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2 +; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7 +; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13 +; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14 +; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15 +; TONGA-NEXT: s_abs_i32 s16, s7 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2 +; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6 +; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8 +; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] +; 
TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] +; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] +; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; TONGA-NEXT: s_ashr_i32 s12, s12, 31 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0 +; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1 +; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1 +; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; TONGA-NEXT: s_xor_b32 s0, s7, s6 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; TONGA-NEXT: s_ashr_i32 s0, s0, 31 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: @@ -2002,7 +2014,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -2049,7 +2061,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll index 49dec15f9f7d7..584d26ed41893 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll @@ -42,34 +42,35 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) ; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, 
v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v2, 24, v1 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 24, v1 ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v5, 24, v0 ; GFX11-FAKE16-NEXT: v_ashrrev_i16 v6, 8, v1 ; GFX11-FAKE16-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX11-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX11-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX11-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v7, v7 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v6, v6 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v5, v5 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v2 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v2, v4 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v9, v2 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v0, v7 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v6, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v2 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v4 -; GFX11-FAKE16-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v4 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v9 +; GFX11-FAKE16-NEXT: global_store_b128 v10, v[0:3], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index 9a051b3fd8bb7..df32e2a4cfad2 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -108,9 +108,9 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i ; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; ; CHECK-NEXT: xor.pred %p6, %p1, %p3; ; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; -; CHECK-NEXT: and.pred %p7, %p6, %p4; +; CHECK-NEXT: and.pred %p8, %p6, %p4; ; CHECK-NEXT: and.pred %p9, %p2, %p4; -; CHECK-NEXT: and.pred %p10, %p3, %p7; +; CHECK-NEXT: and.pred %p10, %p3, %p8; ; CHECK-NEXT: or.pred %p11, %p10, %p9; ; CHECK-NEXT: xor.pred %p12, %p11, %p3; ; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12; diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 28a95ef4f8de9..f11a9c854c465 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -2011,50 +2011,50 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_subnsw_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 0(a2) -; RV32I-NEXT: lw a3, 4(a2) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw a4, 4(a2) ; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a6, 12(a2) +; RV32I-NEXT: lw a2, 12(a2) ; RV32I-NEXT: lw a7, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a6, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) ; RV32I-NEXT: sltu t1, a7, a5 -; RV32I-NEXT: sub t0, t0, a6 -; RV32I-NEXT: sltu a6, a2, a4 +; RV32I-NEXT: sub t0, t0, a2 +; RV32I-NEXT: sltu a2, a6, a3 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: mv t1, a6 -; RV32I-NEXT: beq a1, a3, .LBB31_2 +; RV32I-NEXT: mv t1, a2 +; 
RV32I-NEXT: beq a1, a4, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a3 +; RV32I-NEXT: sltu t1, a1, a4 ; RV32I-NEXT: .LBB31_2: ; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a3, a1, a3 -; RV32I-NEXT: sltu a1, a5, t1 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sltu a4, a5, t1 ; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a1, t0, a1 -; RV32I-NEXT: sub a3, a3, a6 -; RV32I-NEXT: sub a2, a2, a4 -; RV32I-NEXT: bgez a1, .LBB31_4 +; RV32I-NEXT: sub a4, t0, a4 +; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: sub a1, a6, a3 +; RV32I-NEXT: bgez a4, .LBB31_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: snez a4, a3 -; RV32I-NEXT: snez a6, a2 +; RV32I-NEXT: snez a3, a2 +; RV32I-NEXT: snez a6, a1 ; RV32I-NEXT: neg a7, a5 ; RV32I-NEXT: snez a5, a5 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: add a4, a4, a5 +; RV32I-NEXT: add a2, a2, a6 +; RV32I-NEXT: sltu a6, a7, a3 +; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: sub a5, a7, a3 ; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: add a1, a1, a5 -; RV32I-NEXT: add a3, a3, a6 -; RV32I-NEXT: sltu a6, a7, a4 +; RV32I-NEXT: sub a4, a4, a6 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a5, a7, a4 -; RV32I-NEXT: sub a1, a1, a6 -; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB31_4: -; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a5, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128: @@ -2074,50 +2074,50 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_subnsw_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a4, 0(a2) -; RV32ZBB-NEXT: lw a3, 4(a2) +; RV32ZBB-NEXT: lw a3, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a2) ; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a6, 12(a2) +; RV32ZBB-NEXT: lw a2, 12(a2) ; RV32ZBB-NEXT: lw a7, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a6, 0(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) ; RV32ZBB-NEXT: sltu t1, a7, a5 -; RV32ZBB-NEXT: sub t0, t0, a6 -; RV32ZBB-NEXT: sltu a6, a2, a4 +; RV32ZBB-NEXT: sub t0, t0, a2 +; RV32ZBB-NEXT: sltu a2, a6, a3 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: mv t1, a6 -; RV32ZBB-NEXT: beq a1, a3, .LBB31_2 +; RV32ZBB-NEXT: mv t1, a2 +; RV32ZBB-NEXT: beq a1, a4, .LBB31_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a3 +; RV32ZBB-NEXT: sltu t1, a1, a4 ; RV32ZBB-NEXT: .LBB31_2: ; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a3, a1, a3 -; RV32ZBB-NEXT: sltu a1, a5, t1 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sltu a4, a5, t1 ; RV32ZBB-NEXT: sub a5, a5, t1 -; RV32ZBB-NEXT: sub a1, t0, a1 -; RV32ZBB-NEXT: sub a3, a3, a6 -; RV32ZBB-NEXT: sub a2, a2, a4 -; RV32ZBB-NEXT: bgez a1, .LBB31_4 +; RV32ZBB-NEXT: sub a4, t0, a4 +; RV32ZBB-NEXT: sub a2, a1, a2 +; RV32ZBB-NEXT: sub a1, a6, a3 +; RV32ZBB-NEXT: bgez a4, .LBB31_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: snez a4, a3 -; RV32ZBB-NEXT: snez a6, a2 +; RV32ZBB-NEXT: snez a3, a2 +; RV32ZBB-NEXT: snez a6, a1 ; RV32ZBB-NEXT: neg a7, a5 ; RV32ZBB-NEXT: snez a5, a5 +; RV32ZBB-NEXT: or a3, a6, a3 +; RV32ZBB-NEXT: add a4, a4, a5 +; RV32ZBB-NEXT: add a2, a2, a6 +; RV32ZBB-NEXT: sltu a6, a7, a3 +; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: sub a5, a7, a3 ; RV32ZBB-NEXT: neg a2, a2 -; RV32ZBB-NEXT: or a4, a6, a4 -; RV32ZBB-NEXT: add a1, a1, a5 -; RV32ZBB-NEXT: add a3, a3, a6 -; RV32ZBB-NEXT: sltu a6, a7, a4 +; RV32ZBB-NEXT: sub a4, a4, a6 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a5, a7, a4 -; RV32ZBB-NEXT: sub a1, a1, a6 -; RV32ZBB-NEXT: 
neg a3, a3 ; RV32ZBB-NEXT: .LBB31_4: -; RV32ZBB-NEXT: sw a2, 0(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a5, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128: @@ -2142,50 +2142,50 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_subnsw_i128_undef: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 0(a2) -; RV32I-NEXT: lw a3, 4(a2) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw a4, 4(a2) ; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a6, 12(a2) +; RV32I-NEXT: lw a2, 12(a2) ; RV32I-NEXT: lw a7, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a6, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) ; RV32I-NEXT: sltu t1, a7, a5 -; RV32I-NEXT: sub t0, t0, a6 -; RV32I-NEXT: sltu a6, a2, a4 +; RV32I-NEXT: sub t0, t0, a2 +; RV32I-NEXT: sltu a2, a6, a3 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: mv t1, a6 -; RV32I-NEXT: beq a1, a3, .LBB32_2 +; RV32I-NEXT: mv t1, a2 +; RV32I-NEXT: beq a1, a4, .LBB32_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a3 +; RV32I-NEXT: sltu t1, a1, a4 ; RV32I-NEXT: .LBB32_2: ; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a3, a1, a3 -; RV32I-NEXT: sltu a1, a5, t1 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sltu a4, a5, t1 ; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a1, t0, a1 -; RV32I-NEXT: sub a3, a3, a6 -; RV32I-NEXT: sub a2, a2, a4 -; RV32I-NEXT: bgez a1, .LBB32_4 +; RV32I-NEXT: sub a4, t0, a4 +; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: sub a1, a6, a3 +; RV32I-NEXT: bgez a4, .LBB32_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: snez a4, a3 -; RV32I-NEXT: snez a6, a2 +; RV32I-NEXT: snez a3, a2 +; RV32I-NEXT: snez a6, a1 ; RV32I-NEXT: neg a7, a5 ; RV32I-NEXT: snez a5, a5 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: add a4, a4, a5 +; RV32I-NEXT: add a2, a2, a6 +; RV32I-NEXT: sltu a6, a7, a3 +; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: sub a5, a7, a3 ; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: add a1, a1, a5 -; RV32I-NEXT: add a3, a3, a6 -; RV32I-NEXT: sltu a6, a7, a4 +; RV32I-NEXT: sub a4, a4, a6 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a5, a7, a4 -; RV32I-NEXT: sub a1, a1, a6 -; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB32_4: -; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a5, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128_undef: @@ -2205,50 +2205,50 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_subnsw_i128_undef: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a4, 0(a2) -; RV32ZBB-NEXT: lw a3, 4(a2) +; RV32ZBB-NEXT: lw a3, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a2) ; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a6, 12(a2) +; RV32ZBB-NEXT: lw a2, 12(a2) ; RV32ZBB-NEXT: lw a7, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a6, 0(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) ; RV32ZBB-NEXT: sltu t1, a7, a5 -; RV32ZBB-NEXT: sub t0, t0, a6 -; RV32ZBB-NEXT: sltu a6, a2, a4 +; RV32ZBB-NEXT: sub t0, t0, a2 +; RV32ZBB-NEXT: sltu a2, a6, a3 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: mv t1, a6 -; RV32ZBB-NEXT: beq a1, a3, .LBB32_2 +; RV32ZBB-NEXT: mv t1, a2 +; RV32ZBB-NEXT: beq a1, a4, .LBB32_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a3 +; RV32ZBB-NEXT: sltu t1, a1, a4 ; RV32ZBB-NEXT: .LBB32_2: ; 
RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a3, a1, a3 -; RV32ZBB-NEXT: sltu a1, a5, t1 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sltu a4, a5, t1 ; RV32ZBB-NEXT: sub a5, a5, t1 -; RV32ZBB-NEXT: sub a1, t0, a1 -; RV32ZBB-NEXT: sub a3, a3, a6 -; RV32ZBB-NEXT: sub a2, a2, a4 -; RV32ZBB-NEXT: bgez a1, .LBB32_4 +; RV32ZBB-NEXT: sub a4, t0, a4 +; RV32ZBB-NEXT: sub a2, a1, a2 +; RV32ZBB-NEXT: sub a1, a6, a3 +; RV32ZBB-NEXT: bgez a4, .LBB32_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: snez a4, a3 -; RV32ZBB-NEXT: snez a6, a2 +; RV32ZBB-NEXT: snez a3, a2 +; RV32ZBB-NEXT: snez a6, a1 ; RV32ZBB-NEXT: neg a7, a5 ; RV32ZBB-NEXT: snez a5, a5 +; RV32ZBB-NEXT: or a3, a6, a3 +; RV32ZBB-NEXT: add a4, a4, a5 +; RV32ZBB-NEXT: add a2, a2, a6 +; RV32ZBB-NEXT: sltu a6, a7, a3 +; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: sub a5, a7, a3 ; RV32ZBB-NEXT: neg a2, a2 -; RV32ZBB-NEXT: or a4, a6, a4 -; RV32ZBB-NEXT: add a1, a1, a5 -; RV32ZBB-NEXT: add a3, a3, a6 -; RV32ZBB-NEXT: sltu a6, a7, a4 +; RV32ZBB-NEXT: sub a4, a4, a6 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a5, a7, a4 -; RV32ZBB-NEXT: sub a1, a1, a6 -; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB32_4: -; RV32ZBB-NEXT: sw a2, 0(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a5, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128_undef: diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index 117e3e4aac45d..519f1e851a832 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -1110,15 +1110,15 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: .LBB18_3: # %entry ; RV32IF-NEXT: slti a6, a4, 0 ; RV32IF-NEXT: .LBB18_4: # %entry -; RV32IF-NEXT: addi a7, a6, -1 -; RV32IF-NEXT: neg t0, a6 +; RV32IF-NEXT: neg a7, a6 +; RV32IF-NEXT: addi t0, a6, -1 ; RV32IF-NEXT: bnez a6, .LBB18_6 ; RV32IF-NEXT: # %bb.5: # %entry ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB18_6: # %entry -; RV32IF-NEXT: or a3, a7, a3 -; RV32IF-NEXT: and a4, t0, a4 -; RV32IF-NEXT: and a2, t0, a2 +; RV32IF-NEXT: or a3, t0, a3 +; RV32IF-NEXT: and a4, a7, a4 +; RV32IF-NEXT: and a2, a7, a2 ; RV32IF-NEXT: beq a1, a0, .LBB18_8 ; RV32IF-NEXT: # %bb.7: # %entry ; RV32IF-NEXT: sltu a0, a0, a1 @@ -1213,15 +1213,15 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: .LBB18_3: # %entry ; RV32IFD-NEXT: slti a6, a4, 0 ; RV32IFD-NEXT: .LBB18_4: # %entry -; RV32IFD-NEXT: addi a7, a6, -1 -; RV32IFD-NEXT: neg t0, a6 +; RV32IFD-NEXT: neg a7, a6 +; RV32IFD-NEXT: addi t0, a6, -1 ; RV32IFD-NEXT: bnez a6, .LBB18_6 ; RV32IFD-NEXT: # %bb.5: # %entry ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB18_6: # %entry -; RV32IFD-NEXT: or a3, a7, a3 -; RV32IFD-NEXT: and a4, t0, a4 -; RV32IFD-NEXT: and a2, t0, a2 +; RV32IFD-NEXT: or a3, t0, a3 +; RV32IFD-NEXT: and a4, a7, a4 +; RV32IFD-NEXT: and a2, a7, a2 ; RV32IFD-NEXT: beq a1, a0, .LBB18_8 ; RV32IFD-NEXT: # %bb.7: # %entry ; RV32IFD-NEXT: sltu a0, a0, a1 @@ -1378,8 +1378,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: # %bb.4: # %entry ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: .LBB20_5: # %entry -; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: lw a4, 12(sp) +; RV32IF-NEXT: lw a4, 8(sp) +; RV32IF-NEXT: lw a3, 12(sp) ; RV32IF-NEXT: and a5, a2, a1 ; RV32IF-NEXT: beqz a5, .LBB20_7 ; RV32IF-NEXT: # %bb.6: # %entry @@ -1393,12 +1393,12 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: bnez a0, 
.LBB20_10 ; RV32IF-NEXT: # %bb.9: -; RV32IF-NEXT: or a0, a2, a4 +; RV32IF-NEXT: or a0, a4, a2 ; RV32IF-NEXT: snez a1, a0 ; RV32IF-NEXT: .LBB20_10: # %entry ; RV32IF-NEXT: neg a1, a1 -; RV32IF-NEXT: and a0, a1, a2 -; RV32IF-NEXT: and a1, a1, a4 +; RV32IF-NEXT: and a0, a1, a4 +; RV32IF-NEXT: and a1, a1, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -1461,8 +1461,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: # %bb.4: # %entry ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: .LBB20_5: # %entry -; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: lw a4, 12(sp) +; RV32IFD-NEXT: lw a4, 8(sp) +; RV32IFD-NEXT: lw a3, 12(sp) ; RV32IFD-NEXT: and a5, a2, a1 ; RV32IFD-NEXT: beqz a5, .LBB20_7 ; RV32IFD-NEXT: # %bb.6: # %entry @@ -1476,12 +1476,12 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: and a2, a2, a3 ; RV32IFD-NEXT: bnez a0, .LBB20_10 ; RV32IFD-NEXT: # %bb.9: -; RV32IFD-NEXT: or a0, a2, a4 +; RV32IFD-NEXT: or a0, a4, a2 ; RV32IFD-NEXT: snez a1, a0 ; RV32IFD-NEXT: .LBB20_10: # %entry ; RV32IFD-NEXT: neg a1, a1 -; RV32IFD-NEXT: and a0, a1, a2 -; RV32IFD-NEXT: and a1, a1, a4 +; RV32IFD-NEXT: and a0, a1, a4 +; RV32IFD-NEXT: and a1, a1, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -1525,15 +1525,15 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: .LBB21_3: # %entry ; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB21_4: # %entry -; RV32-NEXT: addi a7, a6, -1 -; RV32-NEXT: neg t0, a6 +; RV32-NEXT: neg a7, a6 +; RV32-NEXT: addi t0, a6, -1 ; RV32-NEXT: bnez a6, .LBB21_6 ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB21_6: # %entry -; RV32-NEXT: or a3, a7, a3 -; RV32-NEXT: and a4, t0, a4 -; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a2, a7, a2 ; RV32-NEXT: beq a1, a0, .LBB21_8 ; RV32-NEXT: # %bb.7: # %entry ; RV32-NEXT: sltu a0, a0, a1 @@ -1658,8 +1658,8 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB23_5: # %entry -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: lw a4, 12(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a3, 12(sp) ; RV32-NEXT: and a5, a2, a1 ; RV32-NEXT: beqz a5, .LBB23_7 ; RV32-NEXT: # %bb.6: # %entry @@ -1673,12 +1673,12 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB23_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a2, a4 +; RV32-NEXT: or a0, a4, a2 ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB23_10: # %entry ; RV32-NEXT: neg a1, a1 -; RV32-NEXT: and a0, a1, a2 -; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: and a0, a1, a4 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -1752,15 +1752,15 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: .LBB24_3: # %entry ; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB24_4: # %entry -; RV32-NEXT: addi a7, a6, -1 -; RV32-NEXT: neg t0, a6 +; RV32-NEXT: neg a7, a6 +; RV32-NEXT: addi t0, a6, -1 ; RV32-NEXT: bnez a6, .LBB24_6 ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB24_6: # %entry -; RV32-NEXT: or a3, a7, a3 -; RV32-NEXT: and a4, t0, a4 -; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a2, a7, a2 ; RV32-NEXT: beq a1, a0, .LBB24_8 ; RV32-NEXT: # %bb.7: # %entry ; RV32-NEXT: sltu a0, a0, a1 @@ -1921,8 +1921,8 @@ define i64 @ustest_f16i64(half %x) { ; 
RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB26_5: # %entry -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: lw a4, 12(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a3, 12(sp) ; RV32-NEXT: and a5, a2, a1 ; RV32-NEXT: beqz a5, .LBB26_7 ; RV32-NEXT: # %bb.6: # %entry @@ -1936,12 +1936,12 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB26_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a2, a4 +; RV32-NEXT: or a0, a4, a2 ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB26_10: # %entry ; RV32-NEXT: neg a1, a1 -; RV32-NEXT: and a0, a1, a2 -; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: and a0, a1, a4 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3046,15 +3046,15 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: .LBB45_3: # %entry ; RV32IF-NEXT: slti a6, a4, 0 ; RV32IF-NEXT: .LBB45_4: # %entry -; RV32IF-NEXT: addi a7, a6, -1 -; RV32IF-NEXT: neg t0, a6 +; RV32IF-NEXT: neg a7, a6 +; RV32IF-NEXT: addi t0, a6, -1 ; RV32IF-NEXT: bnez a6, .LBB45_6 ; RV32IF-NEXT: # %bb.5: # %entry ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB45_6: # %entry -; RV32IF-NEXT: or a3, a7, a3 -; RV32IF-NEXT: and a4, t0, a4 -; RV32IF-NEXT: and a2, t0, a2 +; RV32IF-NEXT: or a3, t0, a3 +; RV32IF-NEXT: and a4, a7, a4 +; RV32IF-NEXT: and a2, a7, a2 ; RV32IF-NEXT: beq a1, a0, .LBB45_8 ; RV32IF-NEXT: # %bb.7: # %entry ; RV32IF-NEXT: sltu a0, a0, a1 @@ -3149,15 +3149,15 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .LBB45_3: # %entry ; RV32IFD-NEXT: slti a6, a4, 0 ; RV32IFD-NEXT: .LBB45_4: # %entry -; RV32IFD-NEXT: addi a7, a6, -1 -; RV32IFD-NEXT: neg t0, a6 +; RV32IFD-NEXT: neg a7, a6 +; RV32IFD-NEXT: addi t0, a6, -1 ; RV32IFD-NEXT: bnez a6, .LBB45_6 ; RV32IFD-NEXT: # %bb.5: # %entry ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB45_6: # %entry -; RV32IFD-NEXT: or a3, a7, a3 -; RV32IFD-NEXT: and a4, t0, a4 -; RV32IFD-NEXT: and a2, t0, a2 +; RV32IFD-NEXT: or a3, t0, a3 +; RV32IFD-NEXT: and a4, a7, a4 +; RV32IFD-NEXT: and a2, a7, a2 ; RV32IFD-NEXT: beq a1, a0, .LBB45_8 ; RV32IFD-NEXT: # %bb.7: # %entry ; RV32IFD-NEXT: sltu a0, a0, a1 @@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti -; RV32IF-NEXT: lw a0, 20(sp) -; RV32IF-NEXT: lw a1, 8(sp) -; RV32IF-NEXT: lw a2, 12(sp) +; RV32IF-NEXT: lw a0, 8(sp) +; RV32IF-NEXT: lw a1, 12(sp) +; RV32IF-NEXT: lw a2, 20(sp) ; RV32IF-NEXT: lw a3, 16(sp) -; RV32IF-NEXT: beqz a0, .LBB47_2 +; RV32IF-NEXT: beqz a2, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a4, a0, 0 +; RV32IF-NEXT: slti a4, a2, 0 ; RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: ; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry ; RV32IF-NEXT: xori a3, a3, 1 -; RV32IF-NEXT: or a3, a3, a0 +; RV32IF-NEXT: or a3, a3, a2 ; RV32IF-NEXT: seqz a3, a3 ; RV32IF-NEXT: addi a3, a3, -1 ; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 -; RV32IF-NEXT: and a2, a3, a2 ; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: and a0, a3, a0 -; RV32IF-NEXT: slti a0, a0, 0 -; RV32IF-NEXT: addi a3, a0, -1 -; RV32IF-NEXT: and a0, a3, a1 -; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: and a2, a3, a2 +; RV32IF-NEXT: slti a2, a2, 0 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: and a0, a2, a0 +; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -3354,30 +3354,30 @@ define i64 
@ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti -; RV32IFD-NEXT: lw a0, 20(sp) -; RV32IFD-NEXT: lw a1, 8(sp) -; RV32IFD-NEXT: lw a2, 12(sp) +; RV32IFD-NEXT: lw a0, 8(sp) +; RV32IFD-NEXT: lw a1, 12(sp) +; RV32IFD-NEXT: lw a2, 20(sp) ; RV32IFD-NEXT: lw a3, 16(sp) -; RV32IFD-NEXT: beqz a0, .LBB47_2 +; RV32IFD-NEXT: beqz a2, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a4, a0, 0 +; RV32IFD-NEXT: slti a4, a2, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: ; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry ; RV32IFD-NEXT: xori a3, a3, 1 -; RV32IFD-NEXT: or a3, a3, a0 +; RV32IFD-NEXT: or a3, a3, a2 ; RV32IFD-NEXT: seqz a3, a3 ; RV32IFD-NEXT: addi a3, a3, -1 ; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 -; RV32IFD-NEXT: and a2, a3, a2 ; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: slti a0, a0, 0 -; RV32IFD-NEXT: addi a3, a0, -1 -; RV32IFD-NEXT: and a0, a3, a1 -; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: and a2, a3, a2 +; RV32IFD-NEXT: slti a2, a2, 0 +; RV32IFD-NEXT: addi a2, a2, -1 +; RV32IFD-NEXT: and a0, a2, a0 +; RV32IFD-NEXT: and a1, a2, a1 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -3419,15 +3419,15 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: .LBB48_3: # %entry ; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB48_4: # %entry -; RV32-NEXT: addi a7, a6, -1 -; RV32-NEXT: neg t0, a6 +; RV32-NEXT: neg a7, a6 +; RV32-NEXT: addi t0, a6, -1 ; RV32-NEXT: bnez a6, .LBB48_6 ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB48_6: # %entry -; RV32-NEXT: or a3, a7, a3 -; RV32-NEXT: and a4, t0, a4 -; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a2, a7, a2 ; RV32-NEXT: beq a1, a0, .LBB48_8 ; RV32-NEXT: # %bb.7: # %entry ; RV32-NEXT: sltu a0, a0, a1 @@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a1, 8(sp) -; RV32-NEXT: lw a2, 12(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 20(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a0, .LBB50_2 +; RV32-NEXT: beqz a2, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a0, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a0 +; RV32-NEXT: or a3, a3, a2 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: slti a0, a0, 0 -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: and a0, a3, a1 -; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: slti a2, a2, 0 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3620,15 +3620,15 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: .LBB51_3: # %entry ; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB51_4: # %entry -; RV32-NEXT: addi a7, a6, -1 -; RV32-NEXT: neg t0, a6 +; RV32-NEXT: neg a7, a6 +; RV32-NEXT: addi t0, a6, -1 ; RV32-NEXT: bnez a6, .LBB51_6 ; RV32-NEXT: # %bb.5: 
# %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB51_6: # %entry -; RV32-NEXT: or a3, a7, a3 -; RV32-NEXT: and a4, t0, a4 -; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a2, a7, a2 ; RV32-NEXT: beq a1, a0, .LBB51_8 ; RV32-NEXT: # %bb.7: # %entry ; RV32-NEXT: sltu a0, a0, a1 @@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a1, 8(sp) -; RV32-NEXT: lw a2, 12(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 20(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a0, .LBB53_2 +; RV32-NEXT: beqz a2, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a0, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a0 +; RV32-NEXT: or a3, a3, a2 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: slti a0, a0, 0 -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: and a0, a3, a1 -; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: slti a2, a2, 0 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll index 774f1a1608821..c157c63722cb4 100644 --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) { define i128 @abs128(i128 %x) { ; RV32I-LABEL: abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 12(a1) -; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a2, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a3, .LBB8_2 +; RV32I-NEXT: bgez a2, .LBB8_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: neg a5, a1 ; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: snez a7, a2 +; RV32I-NEXT: snez a7, a3 ; RV32I-NEXT: snez a1, a1 ; RV32I-NEXT: neg a4, a4 ; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: sub a4, a4, a7 -; RV32I-NEXT: sltu a3, a5, a6 +; RV32I-NEXT: sltu a2, a5, a6 ; RV32I-NEXT: neg a7, a1 ; RV32I-NEXT: sub a1, a5, a6 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a2, a7, a2 +; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB8_2: -; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 12(a1) -; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a2, 12(a1) +; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a3, .LBB8_2 +; RV32ZBB-NEXT: bgez a2, .LBB8_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: neg a5, a1 ; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: snez a7, a2 +; RV32ZBB-NEXT: snez a7, a3 ; RV32ZBB-NEXT: snez a1, a1 ; RV32ZBB-NEXT: neg a4, a4 ; RV32ZBB-NEXT: or a6, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add a1, a2, a1 ; RV32ZBB-NEXT: sub a4, a4, a7 -; RV32ZBB-NEXT: sltu a3, a5, a6 +; RV32ZBB-NEXT: sltu a2, a5, a6 ; 
RV32ZBB-NEXT: neg a7, a1 ; RV32ZBB-NEXT: sub a1, a5, a6 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a2, a7, a2 +; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB8_2: -; RV32ZBB-NEXT: sw a2, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a4, 4(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: abs128: @@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) { define i128 @select_abs128(i128 %x) { ; RV32I-LABEL: select_abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 12(a1) -; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a2, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a3, .LBB9_2 +; RV32I-NEXT: bgez a2, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: neg a5, a1 ; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: snez a7, a2 +; RV32I-NEXT: snez a7, a3 ; RV32I-NEXT: snez a1, a1 ; RV32I-NEXT: neg a4, a4 ; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: sub a4, a4, a7 -; RV32I-NEXT: sltu a3, a5, a6 +; RV32I-NEXT: sltu a2, a5, a6 ; RV32I-NEXT: neg a7, a1 ; RV32I-NEXT: sub a1, a5, a6 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a2, a7, a2 +; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: select_abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 12(a1) -; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a2, 12(a1) +; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a3, .LBB9_2 +; RV32ZBB-NEXT: bgez a2, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: neg a5, a1 ; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: snez a7, a2 +; RV32ZBB-NEXT: snez a7, a3 ; RV32ZBB-NEXT: snez a1, a1 ; RV32ZBB-NEXT: neg a4, a4 ; RV32ZBB-NEXT: or a6, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add a1, a2, a1 ; RV32ZBB-NEXT: sub a4, a4, a7 -; RV32ZBB-NEXT: sltu a3, a5, a6 +; RV32ZBB-NEXT: sltu a2, a5, a6 ; RV32ZBB-NEXT: neg a7, a1 ; RV32ZBB-NEXT: sub a1, a5, a6 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a2, a7, a2 +; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB9_2: -; RV32ZBB-NEXT: sw a2, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a4, 4(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: select_abs128: diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 8dd63015971d0..eb8b769b6d083 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -1587,59 +1587,59 @@ define i64 @sub_if_uge_i64(i64 %x, i64 %y) { define i128 @sub_if_uge_i128(i128 %x, i128 %y) { ; CHECK-LABEL: sub_if_uge_i128: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a7, 4(a2) -; CHECK-NEXT: lw a6, 8(a2) -; CHECK-NEXT: lw t0, 12(a2) ; CHECK-NEXT: lw a3, 4(a1) -; CHECK-NEXT: lw a4, 12(a1) -; CHECK-NEXT: lw a5, 8(a1) -; CHECK-NEXT: beq a4, t0, .LBB53_2 +; CHECK-NEXT: lw a4, 8(a1) +; CHECK-NEXT: lw a5, 12(a1) +; CHECK-NEXT: lw a6, 4(a2) +; CHECK-NEXT: lw t0, 12(a2) +; CHECK-NEXT: lw a7, 8(a2) +; CHECK-NEXT: beq a5, t0, .LBB53_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: sltu t1, a4, t0 +; CHECK-NEXT: sltu t1, a5, t0 ; CHECK-NEXT: j .LBB53_3 ; CHECK-NEXT: .LBB53_2: -; 
CHECK-NEXT: sltu t1, a5, a6 +; CHECK-NEXT: sltu t1, a4, a7 ; CHECK-NEXT: .LBB53_3: -; CHECK-NEXT: lw a2, 0(a2) ; CHECK-NEXT: lw a1, 0(a1) -; CHECK-NEXT: beq a3, a7, .LBB53_5 +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: beq a3, a6, .LBB53_5 ; CHECK-NEXT: # %bb.4: -; CHECK-NEXT: sltu t2, a3, a7 +; CHECK-NEXT: sltu t2, a3, a6 ; CHECK-NEXT: j .LBB53_6 ; CHECK-NEXT: .LBB53_5: ; CHECK-NEXT: sltu t2, a1, a2 ; CHECK-NEXT: .LBB53_6: -; CHECK-NEXT: xor t3, a4, t0 -; CHECK-NEXT: xor t4, a5, a6 +; CHECK-NEXT: xor t3, a5, t0 +; CHECK-NEXT: xor t4, a4, a7 ; CHECK-NEXT: or t3, t4, t3 ; CHECK-NEXT: beqz t3, .LBB53_8 ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: mv t2, t1 ; CHECK-NEXT: .LBB53_8: -; CHECK-NEXT: addi t2, t2, -1 -; CHECK-NEXT: and t1, t2, t0 -; CHECK-NEXT: and t0, t2, a2 -; CHECK-NEXT: and a7, t2, a7 +; CHECK-NEXT: addi t3, t2, -1 +; CHECK-NEXT: and t2, t3, t0 +; CHECK-NEXT: and t0, t3, a2 +; CHECK-NEXT: and t1, t3, a6 ; CHECK-NEXT: sltu a2, a1, t0 -; CHECK-NEXT: and t2, t2, a6 +; CHECK-NEXT: and a7, t3, a7 ; CHECK-NEXT: mv a6, a2 -; CHECK-NEXT: beq a3, a7, .LBB53_10 +; CHECK-NEXT: beq a3, t1, .LBB53_10 ; CHECK-NEXT: # %bb.9: -; CHECK-NEXT: sltu a6, a3, a7 +; CHECK-NEXT: sltu a6, a3, t1 ; CHECK-NEXT: .LBB53_10: -; CHECK-NEXT: sub t3, a5, t2 -; CHECK-NEXT: sltu a5, a5, t2 -; CHECK-NEXT: sub a4, a4, t1 -; CHECK-NEXT: sub a3, a3, a7 +; CHECK-NEXT: sub t3, a4, a7 +; CHECK-NEXT: sltu a4, a4, a7 +; CHECK-NEXT: sub a5, a5, t2 +; CHECK-NEXT: sub a3, a3, t1 ; CHECK-NEXT: sub a1, a1, t0 ; CHECK-NEXT: sltu a7, t3, a6 -; CHECK-NEXT: sub a4, a4, a5 -; CHECK-NEXT: sub a5, t3, a6 +; CHECK-NEXT: sub a5, a5, a4 +; CHECK-NEXT: sub a4, t3, a6 ; CHECK-NEXT: sub a3, a3, a2 -; CHECK-NEXT: sub a2, a4, a7 +; CHECK-NEXT: sub a2, a5, a7 ; CHECK-NEXT: sw a1, 0(a0) ; CHECK-NEXT: sw a3, 4(a0) -; CHECK-NEXT: sw a5, 8(a0) +; CHECK-NEXT: sw a4, 8(a0) ; CHECK-NEXT: sw a2, 12(a0) ; CHECK-NEXT: ret %cmp = icmp ult i128 %x, %y diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll index 1a3beeb79b85b..e3728bffacf80 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbs.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll @@ -785,16 +785,16 @@ define i32 @bset_trailing_ones_i32_no_mask(i32 %a) nounwind { define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind { ; CHECK-LABEL: bset_trailing_ones_i64_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, -1 -; CHECK-NEXT: andi a3, a0, 63 -; CHECK-NEXT: addi a1, a3, -32 -; CHECK-NEXT: sll a0, a2, a0 +; CHECK-NEXT: andi a2, a0, 63 +; CHECK-NEXT: li a3, -1 +; CHECK-NEXT: addi a1, a2, -32 +; CHECK-NEXT: sll a0, a3, a0 ; CHECK-NEXT: bltz a1, .LBB43_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: sll a2, a2, a3 +; CHECK-NEXT: sll a2, a3, a2 ; CHECK-NEXT: j .LBB43_3 ; CHECK-NEXT: .LBB43_2: -; CHECK-NEXT: not a2, a3 +; CHECK-NEXT: not a2, a2 ; CHECK-NEXT: lui a3, 524288 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: srl a2, a3, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index f481f9cff5de1..9ef7f9441171c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -89,17 +89,17 @@ entry: define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NOV-LABEL: ustest_f64i32: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz +; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: li a2, -1 ; CHECK-NOV-NEXT: srli a2, a2, 32 -; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz -; CHECK-NOV-NEXT: blt a1, a2, .LBB2_2 +; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz +; CHECK-NOV-NEXT: 
blt a0, a2, .LBB2_2 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: mv a1, a2 +; CHECK-NOV-NEXT: mv a0, a2 ; CHECK-NOV-NEXT: .LBB2_2: # %entry -; CHECK-NOV-NEXT: blt a0, a2, .LBB2_4 +; CHECK-NOV-NEXT: blt a1, a2, .LBB2_4 ; CHECK-NOV-NEXT: # %bb.3: # %entry -; CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: mv a1, a2 ; CHECK-NOV-NEXT: .LBB2_4: # %entry ; CHECK-NOV-NEXT: sgtz a2, a1 ; CHECK-NOV-NEXT: sgtz a3, a0 @@ -254,50 +254,50 @@ entry: define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i32: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NOV-NEXT: li a4, -1 -; CHECK-NOV-NEXT: srli a4, a4, 32 -; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB5_6 +; CHECK-NOV-NEXT: fcvt.l.s a1, fa0, rtz +; CHECK-NOV-NEXT: li a5, -1 +; CHECK-NOV-NEXT: srli a5, a5, 32 +; CHECK-NOV-NEXT: fcvt.l.s a2, fa1, rtz +; CHECK-NOV-NEXT: bge a1, a5, .LBB5_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB5_7 +; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz +; CHECK-NOV-NEXT: bge a2, a5, .LBB5_7 ; CHECK-NOV-NEXT: .LBB5_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB5_8 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz +; CHECK-NOV-NEXT: bge a3, a5, .LBB5_8 ; CHECK-NOV-NEXT: .LBB5_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB5_5 +; CHECK-NOV-NEXT: blt a4, a5, .LBB5_5 ; CHECK-NOV-NEXT: .LBB5_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a4, a5 ; CHECK-NOV-NEXT: .LBB5_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a1 -; CHECK-NOV-NEXT: sgtz a6, a2 -; CHECK-NOV-NEXT: sgtz a7, a3 -; CHECK-NOV-NEXT: sgtz t0, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 +; CHECK-NOV-NEXT: sgtz a6, a3 +; CHECK-NOV-NEXT: sgtz a7, a2 +; CHECK-NOV-NEXT: sgtz t0, a1 ; CHECK-NOV-NEXT: neg t0, t0 ; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: neg a6, a6 -; CHECK-NOV-NEXT: neg a4, a4 -; CHECK-NOV-NEXT: and a5, t0, a5 -; CHECK-NOV-NEXT: and a3, a7, a3 -; CHECK-NOV-NEXT: and a2, a6, a2 -; CHECK-NOV-NEXT: and a1, a4, a1 -; CHECK-NOV-NEXT: sw a5, 0(a0) -; CHECK-NOV-NEXT: sw a3, 4(a0) -; CHECK-NOV-NEXT: sw a2, 8(a0) -; CHECK-NOV-NEXT: sw a1, 12(a0) +; CHECK-NOV-NEXT: neg a5, a5 +; CHECK-NOV-NEXT: and a1, t0, a1 +; CHECK-NOV-NEXT: and a2, a7, a2 +; CHECK-NOV-NEXT: and a3, a6, a3 +; CHECK-NOV-NEXT: and a4, a5, a4 +; CHECK-NOV-NEXT: sw a1, 0(a0) +; CHECK-NOV-NEXT: sw a2, 4(a0) +; CHECK-NOV-NEXT: sw a3, 8(a0) +; CHECK-NOV-NEXT: sw a4, 12(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB5_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 -; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB5_2 +; CHECK-NOV-NEXT: mv a1, a5 +; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz +; CHECK-NOV-NEXT: blt a2, a5, .LBB5_2 ; CHECK-NOV-NEXT: .LBB5_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 -; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB5_3 +; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz +; CHECK-NOV-NEXT: blt a3, a5, .LBB5_3 ; CHECK-NOV-NEXT: .LBB5_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB5_4 +; CHECK-NOV-NEXT: mv a3, a5 +; CHECK-NOV-NEXT: bge a4, a5, .LBB5_4 ; CHECK-NOV-NEXT: j .LBB5_5 ; ; CHECK-V-LABEL: ustest_f32i32: @@ -720,8 +720,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 8(a1) -; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: 
lhu s2, 16(a1) ; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: fmv.w.x fa0, a2 @@ -730,43 +730,43 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NOV-NEXT: li a2, -1 -; CHECK-NOV-NEXT: srli a2, a2, 32 -; CHECK-NOV-NEXT: bge a0, a2, .LBB8_6 +; CHECK-NOV-NEXT: li a3, -1 +; CHECK-NOV-NEXT: srli a3, a3, 32 +; CHECK-NOV-NEXT: bge a0, a3, .LBB8_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NOV-NEXT: bge s1, a2, .LBB8_7 +; CHECK-NOV-NEXT: bge s1, a3, .LBB8_7 ; CHECK-NOV-NEXT: .LBB8_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NOV-NEXT: bge a1, a2, .LBB8_8 +; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz +; CHECK-NOV-NEXT: bge a1, a3, .LBB8_8 ; CHECK-NOV-NEXT: .LBB8_3: # %entry -; CHECK-NOV-NEXT: blt a3, a2, .LBB8_5 +; CHECK-NOV-NEXT: blt a2, a3, .LBB8_5 ; CHECK-NOV-NEXT: .LBB8_4: # %entry -; CHECK-NOV-NEXT: mv a3, a2 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: .LBB8_5: # %entry -; CHECK-NOV-NEXT: sgtz a2, a0 -; CHECK-NOV-NEXT: sgtz a4, s1 -; CHECK-NOV-NEXT: sgtz a5, a1 -; CHECK-NOV-NEXT: sgtz a6, a3 +; CHECK-NOV-NEXT: sgtz a3, a2 +; CHECK-NOV-NEXT: sgtz a4, a1 +; CHECK-NOV-NEXT: sgtz a5, s1 +; CHECK-NOV-NEXT: sgtz a6, a0 ; CHECK-NOV-NEXT: neg a6, a6 ; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: neg a4, a4 -; CHECK-NOV-NEXT: neg a2, a2 -; CHECK-NOV-NEXT: and a3, a6, a3 -; CHECK-NOV-NEXT: and a1, a5, a1 -; CHECK-NOV-NEXT: and a4, a4, s1 -; CHECK-NOV-NEXT: and a0, a2, a0 -; CHECK-NOV-NEXT: sw a3, 0(s0) -; CHECK-NOV-NEXT: sw a1, 4(s0) -; CHECK-NOV-NEXT: sw a4, 8(s0) -; CHECK-NOV-NEXT: sw a0, 12(s0) +; CHECK-NOV-NEXT: neg a3, a3 +; CHECK-NOV-NEXT: and a0, a6, a0 +; CHECK-NOV-NEXT: and a5, a5, s1 +; CHECK-NOV-NEXT: and a1, a4, a1 +; CHECK-NOV-NEXT: and a2, a3, a2 +; CHECK-NOV-NEXT: sw a0, 0(s0) +; CHECK-NOV-NEXT: sw a5, 4(s0) +; CHECK-NOV-NEXT: sw a1, 8(s0) +; CHECK-NOV-NEXT: sw a2, 12(s0) ; CHECK-NOV-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 40(sp) # 8-byte Folded Reload @@ -788,16 +788,16 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB8_6: # %entry ; CHECK-NOV-NEXT: .cfi_restore_state -; CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: mv a0, a3 ; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NOV-NEXT: blt s1, a2, .LBB8_2 +; CHECK-NOV-NEXT: blt s1, a3, .LBB8_2 ; CHECK-NOV-NEXT: .LBB8_7: # %entry -; CHECK-NOV-NEXT: mv s1, a2 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NOV-NEXT: blt a1, a2, .LBB8_3 +; CHECK-NOV-NEXT: mv s1, a3 +; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz +; CHECK-NOV-NEXT: blt a1, a3, .LBB8_3 ; CHECK-NOV-NEXT: .LBB8_8: # %entry -; CHECK-NOV-NEXT: mv a1, a2 -; CHECK-NOV-NEXT: bge a3, a2, .LBB8_4 +; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: bge a2, a3, .LBB8_4 ; CHECK-NOV-NEXT: j .LBB8_5 ; ; CHECK-V-LABEL: ustest_f16i32: @@ -977,17 +977,17 @@ entry: define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-NOV-LABEL: ustest_f64i16: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz +; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, 
rtz ; CHECK-NOV-NEXT: lui a2, 16 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz -; CHECK-NOV-NEXT: blt a1, a2, .LBB11_2 +; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz +; CHECK-NOV-NEXT: blt a0, a2, .LBB11_2 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: mv a1, a2 +; CHECK-NOV-NEXT: mv a0, a2 ; CHECK-NOV-NEXT: .LBB11_2: # %entry -; CHECK-NOV-NEXT: blt a0, a2, .LBB11_4 +; CHECK-NOV-NEXT: blt a1, a2, .LBB11_4 ; CHECK-NOV-NEXT: # %bb.3: # %entry -; CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: mv a1, a2 ; CHECK-NOV-NEXT: .LBB11_4: # %entry ; CHECK-NOV-NEXT: sgtz a2, a1 ; CHECK-NOV-NEXT: sgtz a3, a0 @@ -1146,50 +1146,50 @@ entry: define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i16: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NOV-NEXT: lui a4, 16 -; CHECK-NOV-NEXT: addi a4, a4, -1 -; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB14_6 +; CHECK-NOV-NEXT: fcvt.w.s a1, fa0, rtz +; CHECK-NOV-NEXT: lui a5, 16 +; CHECK-NOV-NEXT: addi a5, a5, -1 +; CHECK-NOV-NEXT: fcvt.w.s a2, fa1, rtz +; CHECK-NOV-NEXT: bge a1, a5, .LBB14_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB14_7 +; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz +; CHECK-NOV-NEXT: bge a2, a5, .LBB14_7 ; CHECK-NOV-NEXT: .LBB14_2: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB14_8 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz +; CHECK-NOV-NEXT: bge a3, a5, .LBB14_8 ; CHECK-NOV-NEXT: .LBB14_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB14_5 +; CHECK-NOV-NEXT: blt a4, a5, .LBB14_5 ; CHECK-NOV-NEXT: .LBB14_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a4, a5 ; CHECK-NOV-NEXT: .LBB14_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a1 -; CHECK-NOV-NEXT: sgtz a6, a2 -; CHECK-NOV-NEXT: sgtz a7, a3 -; CHECK-NOV-NEXT: sgtz t0, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 +; CHECK-NOV-NEXT: sgtz a6, a3 +; CHECK-NOV-NEXT: sgtz a7, a2 +; CHECK-NOV-NEXT: sgtz t0, a1 ; CHECK-NOV-NEXT: neg t0, t0 ; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: neg a6, a6 -; CHECK-NOV-NEXT: neg a4, a4 -; CHECK-NOV-NEXT: and a5, t0, a5 -; CHECK-NOV-NEXT: and a3, a7, a3 -; CHECK-NOV-NEXT: and a2, a6, a2 -; CHECK-NOV-NEXT: and a1, a4, a1 -; CHECK-NOV-NEXT: sh a5, 0(a0) -; CHECK-NOV-NEXT: sh a3, 2(a0) -; CHECK-NOV-NEXT: sh a2, 4(a0) -; CHECK-NOV-NEXT: sh a1, 6(a0) +; CHECK-NOV-NEXT: neg a5, a5 +; CHECK-NOV-NEXT: and a1, t0, a1 +; CHECK-NOV-NEXT: and a2, a7, a2 +; CHECK-NOV-NEXT: and a3, a6, a3 +; CHECK-NOV-NEXT: and a4, a5, a4 +; CHECK-NOV-NEXT: sh a1, 0(a0) +; CHECK-NOV-NEXT: sh a2, 2(a0) +; CHECK-NOV-NEXT: sh a3, 4(a0) +; CHECK-NOV-NEXT: sh a4, 6(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB14_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 -; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB14_2 +; CHECK-NOV-NEXT: mv a1, a5 +; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz +; CHECK-NOV-NEXT: blt a2, a5, .LBB14_2 ; CHECK-NOV-NEXT: .LBB14_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 -; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB14_3 +; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz +; CHECK-NOV-NEXT: blt a3, a5, .LBB14_3 ; CHECK-NOV-NEXT: .LBB14_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB14_4 +; CHECK-NOV-NEXT: mv a3, a5 +; CHECK-NOV-NEXT: bge a4, a5, .LBB14_4 ; CHECK-NOV-NEXT: j .LBB14_5 ; ; CHECK-V-LABEL: ustest_f32i16: @@ -1974,72 +1974,72 @@ define <8 x i16> @ustest_f16i16(<8 x 
half> %x) { ; CHECK-NOV-NEXT: .cfi_remember_state ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) -; CHECK-NOV-NEXT: lhu a2, 48(a1) -; CHECK-NOV-NEXT: lhu s3, 56(a1) -; CHECK-NOV-NEXT: lhu s4, 0(a1) -; CHECK-NOV-NEXT: lhu s5, 8(a1) +; CHECK-NOV-NEXT: lhu s3, 48(a1) +; CHECK-NOV-NEXT: lhu s4, 56(a1) +; CHECK-NOV-NEXT: lhu s5, 0(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NOV-NEXT: lui a4, 16 -; CHECK-NOV-NEXT: addi a4, a4, -1 -; CHECK-NOV-NEXT: bge a0, a4, .LBB17_10 +; CHECK-NOV-NEXT: lui a5, 16 +; CHECK-NOV-NEXT: addi a5, a5, -1 +; CHECK-NOV-NEXT: bge a0, a5, .LBB17_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NOV-NEXT: bge s1, a4, .LBB17_11 +; CHECK-NOV-NEXT: bge s1, a5, .LBB17_11 ; CHECK-NOV-NEXT: .LBB17_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB17_12 +; CHECK-NOV-NEXT: bge a1, a5, .LBB17_12 ; CHECK-NOV-NEXT: .LBB17_3: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB17_13 +; CHECK-NOV-NEXT: bge a2, a5, .LBB17_13 ; CHECK-NOV-NEXT: .LBB17_4: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB17_14 +; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz +; CHECK-NOV-NEXT: bge a3, a5, .LBB17_14 ; CHECK-NOV-NEXT: .LBB17_5: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz -; CHECK-NOV-NEXT: bge a5, a4, .LBB17_15 +; CHECK-NOV-NEXT: bge a4, a5, .LBB17_15 ; CHECK-NOV-NEXT: .LBB17_6: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz -; CHECK-NOV-NEXT: bge a6, a4, .LBB17_16 +; CHECK-NOV-NEXT: bge a6, a5, .LBB17_16 ; CHECK-NOV-NEXT: .LBB17_7: # %entry -; CHECK-NOV-NEXT: blt a7, a4, .LBB17_9 +; CHECK-NOV-NEXT: blt a7, a5, .LBB17_9 ; CHECK-NOV-NEXT: .LBB17_8: # %entry -; CHECK-NOV-NEXT: mv a7, a4 +; CHECK-NOV-NEXT: mv a7, a5 ; CHECK-NOV-NEXT: .LBB17_9: # %entry -; CHECK-NOV-NEXT: sgtz a4, a0 -; CHECK-NOV-NEXT: sgtz t0, s1 -; CHECK-NOV-NEXT: sgtz t1, a1 -; CHECK-NOV-NEXT: sgtz t2, a2 -; CHECK-NOV-NEXT: sgtz t3, a3 -; CHECK-NOV-NEXT: sgtz t4, a5 -; CHECK-NOV-NEXT: sgtz t5, a6 -; CHECK-NOV-NEXT: sgtz t6, a7 +; CHECK-NOV-NEXT: sgtz a5, a7 +; CHECK-NOV-NEXT: sgtz t0, a6 +; CHECK-NOV-NEXT: sgtz t1, a4 +; CHECK-NOV-NEXT: sgtz t2, a3 +; CHECK-NOV-NEXT: sgtz t3, a2 +; CHECK-NOV-NEXT: sgtz t4, a1 +; CHECK-NOV-NEXT: sgtz t5, s1 +; CHECK-NOV-NEXT: sgtz t6, a0 ; CHECK-NOV-NEXT: neg t6, t6 
; CHECK-NOV-NEXT: neg t5, t5 ; CHECK-NOV-NEXT: neg t4, t4 @@ -2047,23 +2047,23 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: neg t2, t2 ; CHECK-NOV-NEXT: neg t1, t1 ; CHECK-NOV-NEXT: neg t0, t0 -; CHECK-NOV-NEXT: neg a4, a4 -; CHECK-NOV-NEXT: and a7, t6, a7 -; CHECK-NOV-NEXT: and a6, t5, a6 -; CHECK-NOV-NEXT: and a5, t4, a5 -; CHECK-NOV-NEXT: and a3, t3, a3 -; CHECK-NOV-NEXT: and a2, t2, a2 -; CHECK-NOV-NEXT: and a1, t1, a1 -; CHECK-NOV-NEXT: and t0, t0, s1 -; CHECK-NOV-NEXT: and a0, a4, a0 -; CHECK-NOV-NEXT: sh a2, 8(s0) -; CHECK-NOV-NEXT: sh a1, 10(s0) -; CHECK-NOV-NEXT: sh t0, 12(s0) -; CHECK-NOV-NEXT: sh a0, 14(s0) -; CHECK-NOV-NEXT: sh a7, 0(s0) -; CHECK-NOV-NEXT: sh a6, 2(s0) -; CHECK-NOV-NEXT: sh a5, 4(s0) -; CHECK-NOV-NEXT: sh a3, 6(s0) +; CHECK-NOV-NEXT: neg a5, a5 +; CHECK-NOV-NEXT: and a0, t6, a0 +; CHECK-NOV-NEXT: and t5, t5, s1 +; CHECK-NOV-NEXT: and a1, t4, a1 +; CHECK-NOV-NEXT: and a2, t3, a2 +; CHECK-NOV-NEXT: and a3, t2, a3 +; CHECK-NOV-NEXT: and a4, t1, a4 +; CHECK-NOV-NEXT: and a6, t0, a6 +; CHECK-NOV-NEXT: and a5, a5, a7 +; CHECK-NOV-NEXT: sh a3, 8(s0) +; CHECK-NOV-NEXT: sh a4, 10(s0) +; CHECK-NOV-NEXT: sh a6, 12(s0) +; CHECK-NOV-NEXT: sh a5, 14(s0) +; CHECK-NOV-NEXT: sh a0, 0(s0) +; CHECK-NOV-NEXT: sh t5, 2(s0) +; CHECK-NOV-NEXT: sh a1, 4(s0) +; CHECK-NOV-NEXT: sh a2, 6(s0) ; CHECK-NOV-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 112(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 104(sp) # 8-byte Folded Reload @@ -2101,32 +2101,32 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB17_10: # %entry ; CHECK-NOV-NEXT: .cfi_restore_state -; CHECK-NOV-NEXT: mv a0, a4 +; CHECK-NOV-NEXT: mv a0, a5 ; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NOV-NEXT: blt s1, a4, .LBB17_2 +; CHECK-NOV-NEXT: blt s1, a5, .LBB17_2 ; CHECK-NOV-NEXT: .LBB17_11: # %entry -; CHECK-NOV-NEXT: mv s1, a4 +; CHECK-NOV-NEXT: mv s1, a5 ; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NOV-NEXT: blt a1, a4, .LBB17_3 +; CHECK-NOV-NEXT: blt a1, a5, .LBB17_3 ; CHECK-NOV-NEXT: .LBB17_12: # %entry -; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: mv a1, a5 ; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB17_4 +; CHECK-NOV-NEXT: blt a2, a5, .LBB17_4 ; CHECK-NOV-NEXT: .LBB17_13: # %entry -; CHECK-NOV-NEXT: mv a2, a4 -; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB17_5 +; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz +; CHECK-NOV-NEXT: blt a3, a5, .LBB17_5 ; CHECK-NOV-NEXT: .LBB17_14: # %entry -; CHECK-NOV-NEXT: mv a3, a4 +; CHECK-NOV-NEXT: mv a3, a5 ; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz -; CHECK-NOV-NEXT: blt a5, a4, .LBB17_6 +; CHECK-NOV-NEXT: blt a4, a5, .LBB17_6 ; CHECK-NOV-NEXT: .LBB17_15: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a4, a5 ; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz -; CHECK-NOV-NEXT: blt a6, a4, .LBB17_7 +; CHECK-NOV-NEXT: blt a6, a5, .LBB17_7 ; CHECK-NOV-NEXT: .LBB17_16: # %entry -; CHECK-NOV-NEXT: mv a6, a4 -; CHECK-NOV-NEXT: bge a7, a4, .LBB17_8 +; CHECK-NOV-NEXT: mv a6, a5 +; CHECK-NOV-NEXT: bge a7, a5, .LBB17_8 ; CHECK-NOV-NEXT: j .LBB17_9 ; ; CHECK-V-LABEL: ustest_f16i16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll index d0b184bd853ee..afe918bd66648 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll @@ -13,22 +13,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) { ; 
RV32: # %bb.0: ; RV32-NEXT: lw a0, 0(a0) ; RV32-NEXT: srli a2, a0, 16 -; RV32-NEXT: slli a3, a0, 16 -; RV32-NEXT: slli a4, a0, 24 -; RV32-NEXT: slli a5, a0, 8 -; RV32-NEXT: srli a6, a3, 24 -; RV32-NEXT: srai a3, a3, 24 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: slli a4, a0, 16 +; RV32-NEXT: slli a5, a0, 24 +; RV32-NEXT: slli a6, a0, 8 ; RV32-NEXT: srai a4, a4, 24 ; RV32-NEXT: srai a5, a5, 24 +; RV32-NEXT: srai a6, a6, 24 +; RV32-NEXT: sgtz a6, a6 ; RV32-NEXT: sgtz a5, a5 ; RV32-NEXT: sgtz a4, a4 -; RV32-NEXT: sgtz a3, a3 -; RV32-NEXT: neg a3, a3 ; RV32-NEXT: neg a4, a4 ; RV32-NEXT: neg a5, a5 -; RV32-NEXT: and a3, a3, a6 -; RV32-NEXT: and a0, a4, a0 -; RV32-NEXT: and a2, a5, a2 +; RV32-NEXT: neg a6, a6 +; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: and a0, a5, a0 +; RV32-NEXT: and a2, a6, a2 ; RV32-NEXT: slli a3, a3, 8 ; RV32-NEXT: zext.b a0, a0 ; RV32-NEXT: or a0, a0, a3 @@ -39,23 +39,23 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) { ; RV64-LABEL: vec3_setcc_crash: ; RV64: # %bb.0: ; RV64-NEXT: lw a0, 0(a0) -; RV64-NEXT: srliw a2, a0, 16 -; RV64-NEXT: slli a3, a0, 48 -; RV64-NEXT: slli a4, a0, 56 -; RV64-NEXT: slli a5, a0, 40 -; RV64-NEXT: srli a6, a3, 56 -; RV64-NEXT: srai a3, a3, 56 +; RV64-NEXT: srli a2, a0, 16 +; RV64-NEXT: srli a3, a0, 8 +; RV64-NEXT: slli a4, a0, 48 +; RV64-NEXT: slli a5, a0, 56 +; RV64-NEXT: slli a6, a0, 40 ; RV64-NEXT: srai a4, a4, 56 ; RV64-NEXT: srai a5, a5, 56 +; RV64-NEXT: srai a6, a6, 56 +; RV64-NEXT: sgtz a6, a6 ; RV64-NEXT: sgtz a5, a5 ; RV64-NEXT: sgtz a4, a4 -; RV64-NEXT: sgtz a3, a3 -; RV64-NEXT: neg a3, a3 ; RV64-NEXT: neg a4, a4 ; RV64-NEXT: neg a5, a5 -; RV64-NEXT: and a3, a3, a6 -; RV64-NEXT: and a0, a4, a0 -; RV64-NEXT: and a2, a5, a2 +; RV64-NEXT: neg a6, a6 +; RV64-NEXT: and a3, a4, a3 +; RV64-NEXT: and a0, a5, a0 +; RV64-NEXT: and a2, a6, a2 ; RV64-NEXT: slli a3, a3, 8 ; RV64-NEXT: zext.b a0, a0 ; RV64-NEXT: or a0, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll index 5c1e41fb5e628..b83ddce61f44d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll @@ -470,61 +470,61 @@ define @test_vp_splice_nxv16i64( %va, @test_vp_splice_nxv16i64( %va, @test_vp_splice_nxv16i64_negative_offset( %va, %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 { ; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a6, a5, 1 -; CHECK-NEXT: addi a6, a6, -1 -; CHECK-NEXT: slli a1, a5, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: bltu a2, a6, .LBB23_2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a1, a4, 3 +; CHECK-NEXT: slli a7, a4, 1 +; CHECK-NEXT: addi a7, a7, -1 +; CHECK-NEXT: add a5, a0, a1 +; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: bltu a2, a7, .LBB23_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a4, a6 +; CHECK-NEXT: mv a6, a7 ; CHECK-NEXT: .LBB23_2: ; CHECK-NEXT: addi sp, sp, -80 ; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; CHECK-NEXT: addi s0, sp, 80 -; CHECK-NEXT: csrr a6, vlenb -; CHECK-NEXT: slli a6, a6, 5 -; CHECK-NEXT: sub sp, sp, a6 +; CHECK-NEXT: csrr a7, vlenb +; CHECK-NEXT: slli a7, a7, 5 +; CHECK-NEXT: sub sp, sp, a7 ; CHECK-NEXT: andi sp, sp, -64 -; CHECK-NEXT: add a6, a0, a1 -; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: vl8re64.v v24, (a5) +; CHECK-NEXT: slli a5, a6, 3 ; CHECK-NEXT: addi a7, sp, 64 +; CHECK-NEXT: add a6, a7, a5 ; CHECK-NEXT: mv t0, a2 -; CHECK-NEXT: bltu a2, a5, .LBB23_4 +; CHECK-NEXT: bltu a2, a4, .LBB23_4 ; 
CHECK-NEXT: # %bb.3: -; CHECK-NEXT: mv t0, a5 +; CHECK-NEXT: mv t0, a4 ; CHECK-NEXT: .LBB23_4: -; CHECK-NEXT: vl8re64.v v24, (a6) -; CHECK-NEXT: add a6, a7, a4 ; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a7) -; CHECK-NEXT: sub a0, a2, a5 +; CHECK-NEXT: sub a0, a2, a4 +; CHECK-NEXT: add a7, a7, a1 +; CHECK-NEXT: sub t0, a3, a4 ; CHECK-NEXT: sltu a2, a2, a0 ; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a0, a2, a0 -; CHECK-NEXT: add a7, a7, a1 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: and a2, a2, a0 +; CHECK-NEXT: sltu a0, a3, t0 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, t0 +; CHECK-NEXT: add t0, a6, a1 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v16, (a7) -; CHECK-NEXT: mv a0, a3 -; CHECK-NEXT: bltu a3, a5, .LBB23_6 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v24, (t0) +; CHECK-NEXT: bltu a3, a4, .LBB23_6 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: mv a0, a5 +; CHECK-NEXT: mv a3, a4 ; CHECK-NEXT: .LBB23_6: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: li a2, 8 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v0, (a6) -; CHECK-NEXT: sub a2, a3, a5 -; CHECK-NEXT: add a5, a6, a1 -; CHECK-NEXT: sltu a3, a3, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: li a3, 8 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vse64.v v24, (a5) -; CHECK-NEXT: bltu a4, a3, .LBB23_8 +; CHECK-NEXT: bltu a5, a2, .LBB23_8 ; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: li a4, 8 +; CHECK-NEXT: li a5, 8 ; CHECK-NEXT: .LBB23_8: -; CHECK-NEXT: sub a2, a6, a4 +; CHECK-NEXT: sub a2, a6, a5 ; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: addi sp, s0, -80 ; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll index 69d5ce48601f8..e8f4939f9149e 100644 --- a/llvm/test/CodeGen/VE/Scalar/min.ll +++ b/llvm/test/CodeGen/VE/Scalar/min.ll @@ -278,18 +278,18 @@ define i32 @min2u32(i32, i32) { define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; CHECK-LABEL: mini1: ; CHECK: # %bb.0: -; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: and %s2, %s1, %s0 -; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 -; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 +; CHECK-NEXT: and %s2, 1, %s0 +; CHECK-NEXT: and %s0, %s1, %s0 +; CHECK-NEXT: cmov.w.ne %s0, %s1, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) ; ; OPT-LABEL: mini1: ; OPT: # %bb.0: -; OPT-NEXT: and %s0, %s0, (32)0 -; OPT-NEXT: and %s2, %s1, %s0 -; OPT-NEXT: cmov.w.ne %s2, %s1, %s0 -; OPT-NEXT: adds.w.zx %s0, %s2, (0)1 +; OPT-NEXT: and %s2, 1, %s0 +; OPT-NEXT: and %s0, %s1, %s0 +; OPT-NEXT: cmov.w.ne %s0, %s1, %s2 +; OPT-NEXT: adds.w.zx %s0, %s0, (0)1 ; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %0, true %4 = and i1 %3, %1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1ae1d61091362..98187d61c1f84 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2201,9 +2201,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $7, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7] +; SSE41-NEXT: paddw %xmm0, %xmm3 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: psraw $8, %xmm2 @@ -2234,9 +2234,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7] +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index 189de051011d2..962ffe47d0d51 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,18 +490,19 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $3, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $6, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: +; X64-NEXT: sarl $3, %edi +; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $3, %eax -; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: sarl $6, %eax ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 @@ -603,18 +604,19 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: +; X64-NEXT: shrl $3, %edi +; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: shrl $5, %eax ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 953a5e7285fe4..15b43c41b9945 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -600,8 +600,8 @@ define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin ; X86-NEXT: vpinsrd $1, (%edi), %xmm0, %xmm0 ; X86-NEXT: vpinsrd $2, (%esi), 
%xmm0, %xmm0 ; X86-NEXT: vpinsrd $3, (%edx), %xmm0, %xmm0 -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-NEXT: vmovdqa %xmm0, (%ecx) +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm1, (%ecx) ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: popl %esi @@ -616,8 +616,8 @@ define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin ; X64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $3, (%rcx), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vmovdqa %xmm0, (%r9) +; X64-NEXT: vpand %xmm1, %xmm0, %xmm1 +; X64-NEXT: vmovdqa %xmm1, (%r9) ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%r8) diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll index 6376b4d599de7..f3bb3343abbc4 100644 --- a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll @@ -4,7 +4,14 @@ define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: hadd_select_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3] +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] +; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq entry: %and1 = and <4 x i32> %x, @@ -73,7 +80,15 @@ entry: define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: hsub_select_shl_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535] +; CHECK-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9] +; CHECK-NEXT: vpmaxud %xmm2, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %or1 = or <4 x i32> %x, %or2 = or <4 x i32> %y, diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 3a4a638c7330a..fb2433dbbb1e1 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -730,36 +730,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 ; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 ; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -767,20 +767,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX2-LABEL: vec256_i64_signed_mem_reg: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -790,36 +790,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOP-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 +; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4 ; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5 ; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, 
%xmm0 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 +; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 -; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 +; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 +; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7 ; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 +; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq @@ -897,101 +897,101 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i64_signed_reg_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 -; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 -; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 +; AVX1-NEXT: vpmuludq %xmm2, %xmm9, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; 
AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 +; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i64_signed_reg_mem: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: vec256_i64_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 -; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 +; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 +; XOP-NEXT: vpcomgtq %xmm2, %xmm0, %xmm4 +; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm2 ; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 -; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 -; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 -; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; XOP-NEXT: vpcomgtq %xmm3, %xmm1, %xmm5 +; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vpsubq %xmm3, %xmm5, %xmm3 +; XOP-NEXT: vpsrlq $1, %xmm3, %xmm6 +; XOP-NEXT: vpsrlq $1, %xmm2, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2 -; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 -; XOP-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9 +; XOP-NEXT: vpmuludq %xmm2, %xmm9, %xmm2 ; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, 
%xmm6, %xmm4 +; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2 -; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 +; XOP-NEXT: vpsrlq $33, %xmm3, %xmm3 +; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7 +; XOP-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 +; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 +; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; XOP-NEXT: vpsllq $32, %xmm3, %xmm3 +; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i64_signed_reg_mem: @@ -1071,36 +1071,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 ; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 ; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1109,20 +1109,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq 
{{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1133,36 +1133,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vmovdqa 16(%rsi), %xmm1 ; XOP-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 +; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4 ; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0 +; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5 ; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 +; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 -; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 -; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 +; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5 +; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7 +; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7 ; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 +; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4 -; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5 +; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq @@ -1627,27 +1627,27 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i16_signed_reg_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: 
vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsubw %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i16_signed_reg_mem: @@ -1665,25 +1665,25 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw ; ; XOP-LABEL: vec256_i16_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpcomgtw %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpminsw %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 +; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 +; XOP-NEXT: vpcomgtw %xmm3, %xmm1, %xmm4 +; XOP-NEXT: vpcomgtw %xmm2, %xmm0, %xmm5 +; XOP-NEXT: vpminsw %xmm3, %xmm1, %xmm6 +; XOP-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vpsubw %xmm6, %xmm3, %xmm3 +; XOP-NEXT: vpminsw %xmm2, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2 ; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 +; XOP-NEXT: vpsrlw $1, %xmm3, %xmm3 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmacsww %xmm1, %xmm4, %xmm3, %xmm1 +; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm2, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i16_signed_reg_mem: @@ -2425,9 +2425,9 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_reg_mem: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vextractf128 
$1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6 @@ -2487,38 +2487,38 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; ; XOP-LABEL: vec256_i8_signed_reg_mem: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 +; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 +; XOP-NEXT: vpcomgtb %xmm3, %xmm1, %xmm4 +; XOP-NEXT: vpcomgtb %xmm2, %xmm0, %xmm5 +; XOP-NEXT: vpminsb %xmm2, %xmm0, %xmm6 +; XOP-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2 ; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpminsb %xmm3, %xmm1, %xmm6 +; XOP-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 +; XOP-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; XOP-NEXT: vpshlb %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 -; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 ; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] -; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 -; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6 ; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3 +; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_reg_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index 5f6337e29d685..a4750b4cd4ad0 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -507,58 +507,58 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtw 
%ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubw %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_reg_mem: @@ -939,66 +939,66 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; 
AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm7, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index 1921cf383b2f2..a75d42ed0c50f 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -28,24 +28,27 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind { ; ; X86-LABEL: scalar_i32_signed_reg_reg: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB0_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB0_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %t3 = icmp sgt i32 %a1, %a2 ; signed %t4 = select i1 %t3, i32 -1, i32 1 @@ -76,26 +79,27 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind { ; ; X86-LABEL: scalar_i32_unsigned_reg_reg: ; X86: # %bb.0: -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: subl %edi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: setbe %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: ja .LBB1_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB1_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %t3 = icmp ugt i32 %a1, %a2 %t4 = select i1 %t3, i32 -1, i32 1 @@ -128,25 +132,28 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind { ; ; X86-LABEL: scalar_i32_signed_mem_reg: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB2_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %edx +; X86-NEXT: negl %edx ; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB2_2: ; X86-NEXT: shrl %eax ; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %a1 = load i32, ptr %a1_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -178,25 +185,28 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind { ; ; X86-LABEL: scalar_i32_signed_reg_mem: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, 
%ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %a2 = load i32, ptr %a2_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -229,26 +239,29 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; ; X86-LABEL: scalar_i32_signed_mem_mem: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx ; X86-NEXT: movl (%eax), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %esi ; X86-NEXT: jg .LBB4_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: .LBB4_2: ; X86-NEXT: shrl %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %a1 = load i32, ptr %a1_addr %a2 = load i32, ptr %a2_addr @@ -291,36 +304,34 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: orl $1, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: subl %esi, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB5_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB5_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -359,10 +370,10 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %eax, %ebp ; X86-NEXT: sbbl %ecx, %esi @@ -429,45 +440,36 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %ebx -; X86-NEXT: movl 4(%eax), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl $1, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl (%eax), %esi +; X86-NEXT: movl 4(%eax), %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: subl %esi, %edx ; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ebx, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB7_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB7_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl $12, %esp +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -508,37 +510,35 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: orl $1, %ebx +; X86-NEXT: movl 4(%eax), %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: subl %esi, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB8_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB8_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: 
addl %edi, %edx +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -579,46 +579,37 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ebx -; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: movl 4(%ecx), %ecx ; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl $1, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl 4(%eax), %ebp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: subl %esi, %edx ; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ebx, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %bl +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: jl .LBB9_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: .LBB9_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: shrl %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: orl $1, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl $12, %esp +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -667,17 +658,16 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB10_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB10_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -720,17 +710,16 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setbe %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: ja .LBB11_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB11_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %cx, %dx -; X86-NEXT: setae %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -777,16 +766,15 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB12_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB12_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -829,19 +817,18 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB13_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB13_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -888,16 +875,15 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %ecx ; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subw %dx, %ax +; X86-NEXT: setle %bl +; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: jg .LBB14_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: negl %eax ; X86-NEXT: .LBB14_2: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -946,17 +932,16 @@ define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB15_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB15_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -993,18 +978,17 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind { ; X86-LABEL: scalar_i8_unsigned_reg_reg: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movb %ch, %ah -; X86-NEXT: subb %cl, %ah -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: orb $1, %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movb %cl, %al -; X86-NEXT: subb %ch, %al +; 
X86-NEXT: subb %ah, %al +; X86-NEXT: seta %dl ; X86-NEXT: ja .LBB16_2 ; X86-NEXT: # %bb.1: +; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB16_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -1046,17 +1030,16 @@ define i8 @scalar_i8_signed_mem_reg(ptr %a1_addr, i8 %a2) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl (%ecx), %ecx -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB17_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB17_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -1096,17 +1079,16 @@ define i8 @scalar_i8_signed_reg_mem(i8 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb (%eax), %ah -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB18_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB18_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al @@ -1148,17 +1130,16 @@ define i8 @scalar_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl (%ecx), %ecx ; X86-NEXT: movb (%eax), %ah -; X86-NEXT: cmpb %ah, %cl -; X86-NEXT: setg %dl -; X86-NEXT: negb %dl -; X86-NEXT: orb $1, %dl ; X86-NEXT: movb %cl, %al ; X86-NEXT: subb %ah, %al +; X86-NEXT: setg %dl ; X86-NEXT: jg .LBB19_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: subb %cl, %ah ; X86-NEXT: movb %ah, %al ; X86-NEXT: .LBB19_2: +; X86-NEXT: negb %dl +; X86-NEXT: orb $1, %dl ; X86-NEXT: shrb %al ; X86-NEXT: mulb %dl ; X86-NEXT: addb %cl, %al diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index a1da40e7e7655..f53983036a016 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) { define void @PR42833() { ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: -; SSE2-NEXT: movl b(%rip), %eax -; SSE2-NEXT: movdqa c+128(%rip), %xmm0 ; SSE2-NEXT: movdqa c+144(%rip), %xmm2 -; SSE2-NEXT: addl c+128(%rip), %eax +; SSE2-NEXT: movdqa c+128(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: addl b(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 @@ -191,10 +191,10 @@ define void @PR42833() { ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: -; SSE42-NEXT: movl b(%rip), %eax -; SSE42-NEXT: movdqa c+128(%rip), %xmm0 ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 -; SSE42-NEXT: addl c+128(%rip), %eax +; SSE42-NEXT: movdqa c+128(%rip), %xmm0 +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: addl b(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 ; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll index b633c28a214b7..412455384e937 100644 --- a/llvm/test/CodeGen/X86/pr38539.ll +++ b/llvm/test/CodeGen/X86/pr38539.ll @@ -23,7 +23,7 @@ define void @f() nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; 
X86-NEXT: subl $160, %esp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl (%eax), %eax diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 894186f9b343b..1ab1a1a01e168 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -1094,26 +1094,25 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %r11d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: movzbl %al, %edx -; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vpextrb $1, %xmm1, %r13d +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax ; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: subb %r11b, %al -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: subb %sil, %al -; AVX2-NEXT: vpextrb $3, %xmm1, %r13d ; AVX2-NEXT: subb %r13b, %al +; AVX2-NEXT: vpextrb $2, %xmm1, %edx +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: vpextrb $3, %xmm1, %ebp +; AVX2-NEXT: subb %bpl, %al ; AVX2-NEXT: vpextrb $4, %xmm1, %r12d ; AVX2-NEXT: subb %r12b, %al ; AVX2-NEXT: vpextrb $5, %xmm1, %r15d ; AVX2-NEXT: subb %r15b, %al ; AVX2-NEXT: vpextrb $6, %xmm1, %r14d ; AVX2-NEXT: subb %r14b, %al -; AVX2-NEXT: vpextrb $7, %xmm1, %ebp -; AVX2-NEXT: subb %bpl, %al -; AVX2-NEXT: vpextrb $8, %xmm1, %ebx +; AVX2-NEXT: vpextrb $7, %xmm1, %ebx ; AVX2-NEXT: subb %bl, %al +; AVX2-NEXT: vpextrb $8, %xmm1, %r11d +; AVX2-NEXT: subb %r11b, %al ; AVX2-NEXT: vpextrb $9, %xmm1, %r10d ; AVX2-NEXT: subb %r10b, %al ; AVX2-NEXT: vpextrb $10, %xmm1, %r9d @@ -1123,108 +1122,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vpextrb $12, %xmm1, %edi ; AVX2-NEXT: subb %dil, %al ; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi) +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %rsi, %r13 +; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13) ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx) -; AVX2-NEXT: movzbl %r11b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %sil, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %r12b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: andl $1, %eax -; 
AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %r14b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %bpl, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %bl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %r10b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %r9b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %r8b, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %dil, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: addq %r13, %rdx +; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx) +; AVX2-NEXT: andl $1, %ebp +; AVX2-NEXT: addq %rdx, %rbp +; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp) +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %rbp, %r12 +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %r12, %r15 +; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 +; AVX2-NEXT: andl $15, %r12d +; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%r12) +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %r15, %r14 +; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15 +; AVX2-NEXT: andl $15, %r15d +; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%r15) +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r14, %rbx +; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 +; AVX2-NEXT: andl $15, %r14d +; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%r14) +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %rbx, %r11 +; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rbx) +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r11, %r10 +; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%r11) +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r10, %r9 +; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%r10) +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %r9, %r8 +; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%r9) +; AVX2-NEXT: andl $1, %edi +; 
AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%r8) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rdi, %rsi +; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rdi) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: addq %rsi, %rax +; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rsi) ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: cmpq $15, %rax -; AVX2-NEXT: movl $15, %ecx -; AVX2-NEXT: cmovbq %rax, %rcx -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload -; AVX2-NEXT: movb %al, -40(%rsp,%rcx) +; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: cmpq $15, %rcx +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: cmovbq %rcx, %rax +; AVX2-NEXT: vpextrb $15, %xmm0, %ecx +; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload +; AVX2-NEXT: movb %cl, -40(%rsp,%rax) ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 @@ -1805,140 +1790,137 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $128, %rsp -; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %ecx, %r13d -; AVX2-NEXT: movl %edx, %r15d -; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: movl %r9d, %r11d +; AVX2-NEXT: movl %r8d, %r10d +; AVX2-NEXT: movl %ecx, %r9d +; AVX2-NEXT: movl %edx, %r8d +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl 360(%rbp), %eax -; AVX2-NEXT: movl 352(%rbp), %ecx +; AVX2-NEXT: movzbl 360(%rbp), %eax +; AVX2-NEXT: movzbl 352(%rbp), %ecx ; AVX2-NEXT: vmovd %ecx, %xmm4 ; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 368(%rbp), %eax +; AVX2-NEXT: movzbl 368(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 376(%rbp), %eax +; AVX2-NEXT: movzbl 376(%rbp), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 384(%rbp), %eax +; AVX2-NEXT: movzbl 384(%rbp), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 392(%rbp), %eax +; AVX2-NEXT: movzbl 392(%rbp), %eax ; AVX2-NEXT: 
vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 400(%rbp), %eax +; AVX2-NEXT: movzbl 400(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 408(%rbp), %eax +; AVX2-NEXT: movzbl 408(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 416(%rbp), %eax +; AVX2-NEXT: movzbl 416(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 424(%rbp), %eax +; AVX2-NEXT: movzbl 424(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 432(%rbp), %eax +; AVX2-NEXT: movzbl 432(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 440(%rbp), %eax +; AVX2-NEXT: movzbl 440(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 448(%rbp), %eax +; AVX2-NEXT: movzbl 448(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 456(%rbp), %eax +; AVX2-NEXT: movzbl 456(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 464(%rbp), %eax +; AVX2-NEXT: movzbl 464(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 472(%rbp), %eax +; AVX2-NEXT: movzbl 472(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movl 224(%rbp), %eax +; AVX2-NEXT: movzbl 224(%rbp), %eax ; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: movl 232(%rbp), %eax +; AVX2-NEXT: movzbl 232(%rbp), %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 240(%rbp), %eax +; AVX2-NEXT: movzbl 240(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 248(%rbp), %eax +; AVX2-NEXT: movzbl 248(%rbp), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 256(%rbp), %eax +; AVX2-NEXT: movzbl 256(%rbp), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 264(%rbp), %eax +; AVX2-NEXT: movzbl 264(%rbp), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 272(%rbp), %eax +; AVX2-NEXT: movzbl 272(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 280(%rbp), %eax +; AVX2-NEXT: movzbl 280(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 288(%rbp), %eax +; AVX2-NEXT: movzbl 288(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 296(%rbp), %eax +; AVX2-NEXT: movzbl 296(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 304(%rbp), %eax +; AVX2-NEXT: movzbl 304(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 312(%rbp), %eax +; AVX2-NEXT: movzbl 312(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 320(%rbp), %eax +; AVX2-NEXT: movzbl 320(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 328(%rbp), %eax +; AVX2-NEXT: movzbl 328(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 336(%rbp), %eax +; AVX2-NEXT: movzbl 336(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 344(%rbp), %eax +; AVX2-NEXT: movzbl 344(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: movl 96(%rbp), %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: movl 104(%rbp), %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 112(%rbp), %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 120(%rbp), %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 128(%rbp), %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 136(%rbp), %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, 
%xmm5 -; AVX2-NEXT: movl 144(%rbp), %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 152(%rbp), %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 160(%rbp), %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 168(%rbp), %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 176(%rbp), %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 184(%rbp), %eax +; AVX2-NEXT: vmovd %edi, %xmm5 +; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 16(%rbp), %ebx +; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 24(%rbp), %r14d +; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 32(%rbp), %r15d +; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 40(%rbp), %r12d +; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 48(%rbp), %r13d +; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5 +; AVX2-NEXT: movzbl 56(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 192(%rbp), %eax +; AVX2-NEXT: movzbl 64(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 200(%rbp), %eax +; AVX2-NEXT: movzbl 72(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 208(%rbp), %eax +; AVX2-NEXT: movzbl 80(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movl 216(%rbp), %eax +; AVX2-NEXT: movzbl 88(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX2-NEXT: vmovd %edi, %xmm6 -; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6 -; AVX2-NEXT: movl 16(%rbp), %esi -; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6 -; AVX2-NEXT: movl 24(%rbp), %edi -; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 -; AVX2-NEXT: movl 32(%rbp), %r8d -; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 -; AVX2-NEXT: movl 40(%rbp), %r9d -; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 -; AVX2-NEXT: movl 48(%rbp), %r10d -; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6 -; AVX2-NEXT: movl 56(%rbp), %r11d -; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6 -; AVX2-NEXT: movl 64(%rbp), %r14d -; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 -; AVX2-NEXT: movl 72(%rbp), %r12d -; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6 -; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: movzbl 96(%rbp), %eax +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: movzbl 104(%rbp), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 112(%rbp), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 120(%rbp), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 128(%rbp), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 136(%rbp), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 144(%rbp), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 152(%rbp), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 160(%rbp), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 168(%rbp), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 176(%rbp), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 184(%rbp), %eax +; AVX2-NEXT: vpinsrb $11, %eax, 
%xmm6, %xmm6 +; AVX2-NEXT: movzbl 192(%rbp), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 200(%rbp), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl 208(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movl 88(%rbp), %eax +; AVX2-NEXT: movzbl 216(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4 @@ -1980,379 +1962,435 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: vmovaps %ymm2, (%rsp) ; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movzbl (%rsp,%rax), %edx ; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: andl $1, %ebx -; AVX2-NEXT: addq %rax, %rbx -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx) -; AVX2-NEXT: andl $1, %r15d -; AVX2-NEXT: addq %rbx, %r15 -; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15) -; AVX2-NEXT: andl $1, %r13d -; AVX2-NEXT: addq %r15, %r13 -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %r13, %rcx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: andl $1, %esi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) ; AVX2-NEXT: andl $1, %edi -; AVX2-NEXT: addq %rsi, %rdi -; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi -; AVX2-NEXT: andl $63, %esi -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi) +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rdi, %rsi +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi) ; AVX2-NEXT: andl $1, %r8d -; AVX2-NEXT: addq %rdi, %r8 -; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi -; AVX2-NEXT: andl $63, %edi -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi) +; AVX2-NEXT: addq %rsi, %r8 +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8) ; AVX2-NEXT: andl $1, %r9d ; AVX2-NEXT: addq %r8, %r9 -; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 -; AVX2-NEXT: andl $63, %r8d -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8) +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9) ; AVX2-NEXT: andl $1, %r10d ; AVX2-NEXT: addq %r9, %r10 -; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 -; AVX2-NEXT: andl $63, %r9d -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9) +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) ; AVX2-NEXT: andl $1, %r11d ; AVX2-NEXT: addq %r10, %r11 -; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 -; AVX2-NEXT: andl $63, %r10d -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10) -; AVX2-NEXT: andl $1, %r14d -; AVX2-NEXT: addq %r11, %r14 +; AVX2-NEXT: movzbl %bl, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %r11, %rax ; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 ; AVX2-NEXT: andl 
$63, %r11d -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11) -; AVX2-NEXT: andl $1, %r12d -; AVX2-NEXT: addq %r14, %r12 -; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 -; AVX2-NEXT: andl $63, %r14d -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14) -; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11) +; AVX2-NEXT: movzbl %r14b, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl %r15b, %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %r12, %rax -; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 -; AVX2-NEXT: andl $63, %r12d -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12) -; AVX2-NEXT: movl 88(%rbp), %ecx +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl %r12b, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 96(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 56(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl 64(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 72(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl 80(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 88(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movzbl 96(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 104(%rbp), %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 104(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed 
$eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 112(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 120(%rbp), %ecx +; AVX2-NEXT: movzbl 112(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 120(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 128(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 136(%rbp), %ecx +; AVX2-NEXT: movzbl 128(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 136(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 144(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 152(%rbp), %ecx +; AVX2-NEXT: movzbl 144(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 152(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 160(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 168(%rbp), %ecx +; AVX2-NEXT: movzbl 160(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 168(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 176(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, 
(%rsp,%rax) -; AVX2-NEXT: movl 184(%rbp), %ecx +; AVX2-NEXT: movzbl 176(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 184(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 192(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 200(%rbp), %ecx +; AVX2-NEXT: movzbl 192(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 200(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 208(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 216(%rbp), %ecx +; AVX2-NEXT: movzbl 208(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 216(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 224(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 232(%rbp), %ecx +; AVX2-NEXT: movzbl 224(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 232(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 240(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 248(%rbp), %ecx +; AVX2-NEXT: movzbl 240(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; 
AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 248(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 256(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 264(%rbp), %ecx +; AVX2-NEXT: movzbl 256(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 264(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 272(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 280(%rbp), %ecx +; AVX2-NEXT: movzbl 272(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 280(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 288(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 296(%rbp), %ecx +; AVX2-NEXT: movzbl 288(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 296(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 304(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 312(%rbp), %ecx +; AVX2-NEXT: movzbl 304(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 312(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax 
def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 320(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 328(%rbp), %ecx +; AVX2-NEXT: movzbl 320(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 328(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 336(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 344(%rbp), %ecx +; AVX2-NEXT: movzbl 336(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx) +; AVX2-NEXT: movzbl 344(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 352(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: movzbl 352(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 360(%rbp), %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 360(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 368(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 376(%rbp), %ecx +; AVX2-NEXT: movzbl 368(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 376(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 384(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; 
AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 392(%rbp), %ecx +; AVX2-NEXT: movzbl 384(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 392(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 400(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 408(%rbp), %ecx +; AVX2-NEXT: movzbl 400(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 408(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 416(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 424(%rbp), %ecx +; AVX2-NEXT: movzbl 416(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 424(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 432(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 440(%rbp), %ecx +; AVX2-NEXT: movzbl 432(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 440(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 448(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 456(%rbp), %ecx +; AVX2-NEXT: movzbl 448(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: 
andl $63, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 456(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 464(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 472(%rbp), %ecx +; AVX2-NEXT: movzbl 464(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: movzbl 472(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) ; AVX2-NEXT: vpextrb $15, %xmm0, %eax ; AVX2-NEXT: cmpq $64, %rcx -; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; AVX2-NEXT: cmovbl %edx, %eax ; AVX2-NEXT: cmpq $63, %rcx -; AVX2-NEXT: movq %rcx, %rdx -; AVX2-NEXT: movl $63, %ecx -; AVX2-NEXT: cmovbq %rdx, %rcx -; AVX2-NEXT: movb %al, (%rsp,%rcx) +; AVX2-NEXT: movl $63, %edx +; AVX2-NEXT: cmovbq %rcx, %rdx +; AVX2-NEXT: movb %al, (%rsp,%rdx) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: leaq -40(%rbp), %rsp diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index e60b56551e58d..d0690bd291f31 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -509,10 +509,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlw $7, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: paddb %xmm2, %xmm4 +; SSE2-NEXT: psrlw $7, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 @@ -545,10 +545,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: psrlw $7, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: paddb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -572,10 +572,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; 
AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -704,10 +704,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: psrlw $7, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $7, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; X86-SSE2-NEXT: por %xmm3, %xmm4 ; X86-SSE2-NEXT: paddb %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 11a02f8cf754c..421fa98709d48 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -431,10 +431,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -451,10 +451,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2)) ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index d9799975cd37a..4969cb500d4df 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -533,10 +533,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrlw $7, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: paddb %xmm2, %xmm4 +; SSE2-NEXT: psrlw $7, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: paddb %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm0 @@ -568,10 +568,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE41-NEXT: paddb %xmm0, 
%xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $7, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: psrlw $7, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm1, %xmm3 ; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 @@ -596,10 +596,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -731,10 +731,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: psrlw $7, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: paddb %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $7, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; X86-SSE2-NEXT: por %xmm1, %xmm4 ; X86-SSE2-NEXT: paddb %xmm3, %xmm3 ; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 15e09c3b6737e..e2a3e261c0411 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -457,10 +457,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 6c79be75550ed..93f4ce7573ad1 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -442,10 +442,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlw $7, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: paddb %xmm2, %xmm4 +; SSE2-NEXT: psrlw $7, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 @@ -478,10 +478,10 @@ define <16 x i8> 
@var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: psrlw $7, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: paddb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -505,10 +505,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -637,10 +637,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: psrlw $7, %xmm3 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $7, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; X86-SSE2-NEXT: por %xmm3, %xmm4 ; X86-SSE2-NEXT: paddb %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 684721f434ebd..64c31187f29ef 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -375,10 +375,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -395,10 +395,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2)) ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq