diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 43d4138df8b49..9aae043f822a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26091,8 +26091,6 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
     EVT ConcatSrcVT = V.getOperand(0).getValueType();
     assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
            "Concat and extract subvector do not change element type");
-    assert((ExtIdx % ExtNumElts) == 0 &&
-           "Extract index is not a multiple of the input vector length.");
 
     unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
     unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index bc2dbfb4cbaae..a252d911a1d4d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3842,13 +3842,32 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
 
   uint64_t LoEltsMin = Lo.getValueType().getVectorMinNumElements();
   uint64_t IdxVal = Idx->getAsZExtVal();
+  unsigned NumResultElts = SubVT.getVectorMinNumElements();
+
   if (IdxVal < LoEltsMin) {
-    assert(IdxVal + SubVT.getVectorMinNumElements() <= LoEltsMin &&
+    assert(IdxVal + NumResultElts <= LoEltsMin &&
            "Extracted subvector crosses vector split!");
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
-  } else if (SubVT.isScalableVector() ==
-             N->getOperand(0).getValueType().isScalableVector())
-    return DAG.getExtractSubvector(dl, SubVT, Hi, IdxVal - LoEltsMin);
+  }
+
+  EVT SrcVT = N->getOperand(0).getValueType();
+  if (SubVT.isScalableVector() == SrcVT.isScalableVector()) {
+    uint64_t ExtractIdx = IdxVal - LoEltsMin;
+    if (ExtractIdx % NumResultElts == 0)
+      return DAG.getExtractSubvector(dl, SubVT, Hi, ExtractIdx);
+
+    // We cannot create an extract_subvector that isn't a multiple of the result
+    // size, which may go out of bounds for the last elements. Shuffle the
+    // desired elements down to 0 and do a simple 0 extract.
+    EVT HiVT = Hi.getValueType();
+    SmallVector<int> Mask(HiVT.getVectorNumElements(), -1);
+    for (int I = 0; I != static_cast<int>(NumResultElts); ++I)
+      Mask[I] = ExtractIdx + I;
+
+    SDValue Shuffle =
+        DAG.getVectorShuffle(HiVT, dl, Hi, DAG.getPOISON(HiVT), Mask);
+    return DAG.getExtractSubvector(dl, SubVT, Shuffle, 0);
+  }
 
   // After this point the DAG node only permits extracting fixed-width
   // subvectors from scalable vectors.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 84282d8a1c37b..fadf2c7a4b9bc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7956,6 +7956,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     assert(N2C->getAPIntValue().getBitWidth() ==
                TLI->getVectorIdxWidth(getDataLayout()) &&
            "Constant index for EXTRACT_SUBVECTOR has an invalid size");
+    assert(N2C->getZExtValue() % VT.getVectorMinNumElements() == 0 &&
+           "Extract index is not a multiple of the output vector length");
 
     // Trivial extraction.
     if (VT == N1VT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..83bc8e5eaef8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1802,16 +1802,36 @@ std::pair<SDValue, SDValue>
 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                   const EVT &LoVT, const EVT &HiVT,
                                   SelectionDAG &DAG) const {
+  EVT VT = N.getValueType();
   assert(LoVT.getVectorNumElements() +
                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
-             N.getValueType().getVectorNumElements() &&
+             VT.getVectorNumElements() &&
          "More vector elements requested than available!");
   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                            DAG.getVectorIdxConstant(0, DL));
-  SDValue Hi = DAG.getNode(
-      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
-      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
-  return std::pair(Lo, Hi);
+
+  unsigned LoNumElts = LoVT.getVectorNumElements();
+
+  if (HiVT.isVector()) {
+    unsigned HiNumElts = HiVT.getVectorNumElements();
+    if ((VT.getVectorNumElements() % HiNumElts) == 0) {
+      // Avoid creating an extract_subvector with an index that isn't a multiple
+      // of the result type.
+      SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+                               DAG.getConstant(LoNumElts, DL, MVT::i32));
+      return {Lo, Hi};
+    }
+
+    SmallVector<SDValue, 8> Elts;
+    DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
+                              /*Count=*/HiNumElts);
+    SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
+    return {Lo, Hi};
+  }
+
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
+                           DAG.getVectorIdxConstant(LoNumElts, DL));
+  return {Lo, Hi};
 }
 
 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
diff --git a/llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll b/llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll
new file mode 100644
index 0000000000000..f1b1ea3fbd6d7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
+
+define <3 x float> @issue153808_vector_extract_assert(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: issue153808_vector_extract_assert:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, v0
+; GFX900-NEXT:    global_load_dwordx4 v[5:8], v[3:4], off
+; GFX900-NEXT:    global_load_dwordx3 v[0:2], v[3:4], off offset:192
+; GFX900-NEXT:    s_mov_b32 s4, 0
+; GFX900-NEXT:    s_mov_b32 s5, s4
+; GFX900-NEXT:    s_mov_b32 s6, s4
+; GFX900-NEXT:    s_mov_b32 s7, s4
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    buffer_store_dwordx4 v[5:8], off, s[4:7], 0
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: issue153808_vector_extract_assert:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX942-NEXT:    global_load_dwordx3 v[2:4], v[0:1], off offset:192
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_mov_b32 s2, s0
+; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <51 x float>, ptr addrspace(1) %ptr, align 4
+  %val.slice.0 = shufflevector <51 x float> %val, <51 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
+  %val.slice.48 = shufflevector <51 x float> %val, <51 x float> poison, <3 x i32> <i32 48, i32 49, i32 50>
+  ret <3 x float> %val.slice.48
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 0a938b0d2297d..8862cbe6391ea 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -872,66 +872,66 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
 ; GFX7-HSA-NEXT:    s_add_u32 s10, s8, 16
 ; GFX7-HSA-NEXT:    s_addc_u32 s11, s9, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v7, s10
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v8, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s10
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7-HSA-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v7, s8
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v8, s9
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
 ; GFX7-HSA-NEXT:    s_add_u32 s0, s8, 32
-; GFX7-HSA-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v6, s2
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v7, s3
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX7-HSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v6, s14
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-HSA-NEXT:    flat_store_dwordx3 v[0:1], v[4:6]
+; GFX7-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-HSA-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; GFX7-HSA-NEXT:    s_endpgm
 ;
 ; GFX8-NOHSA-LABEL: constant_load_v11i32:
 ; GFX8-NOHSA:       ; %bb.0: ; %entry
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x20
 ; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x20
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT:    flat_store_dwordx3 v[0:1], v[4:6]
+; GFX8-NOHSA-NEXT:    s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s7
+; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_load_v11i32:
@@ -969,25 +969,25 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX9-HSA-LABEL: constant_load_v11i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
 ; GFX9-HSA-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x20
 ; GFX9-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX9-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x20
 ; GFX9-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-HSA-NEXT:    global_store_dwordx4 v7, v[0:3], s[8:9] offset:16
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v5, s13
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-HSA-NEXT:    global_store_dwordx4 v7, v[0:3], s[8:9]
-; GFX9-HSA-NEXT:    global_store_dwordx3 v7, v[4:6], s[8:9] offset:32
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[8:9] offset:16
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[8:9]
+; GFX9-HSA-NEXT:    global_store_dwordx3 v8, v[0:2], s[8:9] offset:32
 ; GFX9-HSA-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: constant_load_v11i32:
@@ -995,19 +995,19 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b96 s[12:14], s[10:11], 0x20
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT:    s_load_b96 s[12:14], s[10:11], 0x20
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12
+; GFX12-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT:    v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
+; GFX12-NEXT:    v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v8, s12
 ; GFX12-NEXT:    v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
 ; GFX12-NEXT:    s_clause 0x2
-; GFX12-NEXT:    global_store_b96 v11, v[8:10], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v11, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v11, v[4:7], s[8:9]
+; GFX12-NEXT:    global_store_b96 v11, v[8:10], s[8:9] offset:32
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <11 x i32>, ptr addrspace(4) %in
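Not part of the patch itself, but for illustration: the LegalizeVectorTypes.cpp change only fires when a fixed-width subvector is extracted from the high half of a split vector at an offset that is not a multiple of the result's element count. A minimal IR sketch of that shape follows; the function name is made up, and whether an EXTRACT_SUBVECTOR with such an index is actually formed depends on how the target legalizes the source vector.

; Hypothetical reduced example, assuming the target splits <8 x float> into
; two <4 x float> halves: lanes 5..7 start at offset 1 into the high half,
; and 1 is not a multiple of the 3-element result, so the legalizer now
; shuffles the wanted lanes down to lane 0 and extracts the subvector at
; index 0 instead of creating an out-of-range extract_subvector.
define <3 x float> @extract_unaligned_slice(<8 x float> %v) {
  %slice = shufflevector <8 x float> %v, <8 x float> poison, <3 x i32> <i32 5, i32 6, i32 7>
  ret <3 x float> %slice
}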