diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index be7521f341685..15aca9e5a9d48 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -474,7 +474,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
 
   switch (getTypeAction(InVT)) {
   case TargetLowering::TypeLegal:
+    // Try and use in-register bitcast
+    if (SDValue Res = LowerBitcastInRegister(N))
+      return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Res);
+    // Fallback to stack load store
     break;
+
   case TargetLowering::TypePromoteInteger:
     if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector())
       // The input promotes to the same size. Convert the promoted value.
@@ -2174,8 +2179,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
-  // This should only occur in unusual situations like bitcasting to an
-  // x86_fp80, so just turn it into a store+load
+  // Try and use in register bitcast
+  if (SDValue Res = LowerBitcastInRegister(N))
+    return Res;
+
+  // Fallback
   return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b6abad830c371..5eb1470a0957b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -14,6 +14,7 @@
 
 #include "LegalizeTypes.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -910,6 +911,97 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
   return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
 }
 
+/// Try to lower the BITCAST node \p N between a fixed-length integer vector
+/// and a scalar integer using in-register operations instead of a stack slot.
+///
+/// vector -> scalar: on little-endian targets where EXTRACT_VECTOR_ELT of a
+/// <1 x ToVT> vector is legal or custom, extract element 0 of the source
+/// bitcast to that type. Otherwise zero-extend every element, shift it to its
+/// endian-dependent bit position and OR the pieces together.
+/// scalar -> vector: produce each element with a logical right shift followed
+/// by a truncate and combine them with BUILD_VECTOR.
+///
+/// \returns the lowered value of type N->getValueType(0), or an empty SDValue
+/// when the type combination is not handled and the caller has to fall back
+/// to its stack store/load lowering.
+SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
+  assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
+
+  SDValue Src = N->getOperand(0);
+  EVT FromVT = Src.getValueType();
+  EVT ToVT = N->getValueType(0);
+  SDLoc DL(N);
+  bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+  if (FromVT.isFixedLengthVector() && ToVT.isScalarInteger()) {
+    if (!IsBigEndian) {
+      // Element 0 of a <1 x ToVT> view of the source is the whole value.
+      EVT ToVecVT = EVT::getVectorVT(*DAG.getContext(), ToVT, 1);
+      if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ToVecVT))
+        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ToVT,
+                           DAG.getBitcast(ToVecVT, Src),
+                           DAG.getVectorIdxConstant(0, DL));
+    }
+
+    EVT ElemVT = FromVT.getVectorElementType();
+    // ZERO_EXTEND is only defined for integers; leave floating-point element
+    // types to the caller's stack fallback.
+    if (!ElemVT.isInteger())
+      return SDValue();
+
+    unsigned NumElems = FromVT.getVectorNumElements();
+    unsigned ElemBits = ElemVT.getSizeInBits();
+    assert(ToVT.getSizeInBits() >= ElemBits * NumElems &&
+           "Scalar type does not have enough bits to pack vector values.");
+
+    SDValue Packed = DAG.getConstant(0, DL, ToVT);
+    for (unsigned I = 0; I < NumElems; ++I) {
+      // Bit offset ElemBits * I holds element I on little-endian targets and
+      // element NumElems - 1 - I on big-endian targets.
+      unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
+      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Src,
+                                 DAG.getVectorIdxConstant(ElementIndex, DL));
+      SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, Elem);
+      SDValue ShiftAmount = DAG.getShiftAmountConstant(ElemBits * I, ToVT, DL);
+      SDValue ShiftedElem =
+          DAG.getNode(ISD::SHL, DL, ToVT, ExtElem, ShiftAmount);
+      Packed = DAG.getNode(ISD::OR, DL, ToVT, Packed, ShiftedElem);
+    }
+
+    return Packed;
+  }
+
+  if (FromVT.isScalarInteger() && ToVT.isFixedLengthVector()) {
+    EVT ElemVT = ToVT.getVectorElementType();
+    // TRUNCATE is only defined for integers; leave floating-point element
+    // types to the caller's stack fallback.
+    if (!ElemVT.isInteger())
+      return SDValue();
+
+    unsigned NumElems = ToVT.getVectorNumElements();
+    unsigned ElemBits = ElemVT.getSizeInBits();
+    assert(FromVT.getSizeInBits() >= ElemBits * NumElems &&
+           "Scalar type does not have enough bits to unpack vector values.");
+
+    SmallVector<SDValue, 8> Elements;
+    Elements.reserve(NumElems);
+    for (unsigned I = 0; I < NumElems; ++I) {
+      // Element I starts at bit ElemBits * I on little-endian targets and at
+      // bit ElemBits * (NumElems - 1 - I) on big-endian targets.
+      unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
+      SDValue ShiftAmount =
+          DAG.getShiftAmountConstant(ElemBits * ElementIndex, FromVT, DL);
+      SDValue Shifted = DAG.getNode(ISD::SRL, DL, FromVT, Src, ShiftAmount);
+      Elements.push_back(DAG.getNode(ISD::TRUNCATE, DL, ElemVT, Shifted));
+    }
+
+    return DAG.getBuildVector(ToVT, DL, Elements);
+  }
+
+  // Unhandled type combination: the caller falls back to the stack.
+  return SDValue();
+}
+
 /// Replace the node's results with custom code provided by the target and
 /// return "true", or do nothing and return "false".
/// The last parameter is FALSE if we are dealing with a node with legal diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 571a710cc92a3..9d0c970e350f9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -216,6 +216,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue BitConvertToInteger(SDValue Op); SDValue BitConvertVectorToIntegerVector(SDValue Op); SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT); + SDValue LowerBitcastInRegister(SDNode *N) const; bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult); bool CustomWidenLowerNode(SDNode *N, EVT VT); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 5c1f717694a4c..7d06139120d71 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -472,13 +472,6 @@ VectorizePTXValueVTs(const SmallVectorImpl &ValueVTs, return VectorInfo; } -static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, - SDValue Value) { - if (Value->getValueType(0) == VT) - return Value; - return DAG.getNode(ISD::BITCAST, DL, VT, Value); -} - // NVPTXTargetLowering Constructor. NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI) @@ -622,9 +615,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); - // Custom conversions to/from v2i8. - setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); - // Only logical ops can be done on v4i8 directly, others must be done // elementwise. 
setOperationAction( @@ -2086,30 +2076,6 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } -SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { - // Handle bitcasting from v2i8 without hitting the default promotion - // strategy which goes through stack memory. - EVT FromVT = Op->getOperand(0)->getValueType(0); - if (FromVT != MVT::v2i8) { - return Op; - } - - // Pack vector elements into i16 and bitcast to final type - SDLoc DL(Op); - SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, - Op->getOperand(0), DAG.getIntPtrConstant(0, DL)); - SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, - Op->getOperand(0), DAG.getIntPtrConstant(1, DL)); - SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0); - SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1); - SDValue Const8 = DAG.getConstant(8, DL, MVT::i16); - SDValue AsInt = DAG.getNode( - ISD::OR, DL, MVT::i16, - {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})}); - EVT ToVT = Op->getValueType(0); - return MaybeBitcast(DAG, DL, ToVT, AsInt); -} - // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it // would get lowered as two constant loads and vector-packing move. // Instead we want just a constant move: @@ -2618,8 +2584,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return Op; case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); - case ISD::BITCAST: - return LowerBITCAST(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return Op; case ISD::EXTRACT_VECTOR_ELT: @@ -5202,28 +5166,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } -static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, - SmallVectorImpl &Results) { - // Handle bitcasting to v2i8 without hitting the default promotion - // strategy which goes through stack memory. 
- SDValue Op(Node, 0); - EVT ToVT = Op->getValueType(0); - if (ToVT != MVT::v2i8) { - return; - } - - // Bitcast to i16 and unpack elements into a vector - SDLoc DL(Node); - SDValue AsInt = MaybeBitcast(DAG, DL, MVT::i16, Op->getOperand(0)); - SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt); - SDValue Const8 = DAG.getConstant(8, DL, MVT::i16); - SDValue Vec1 = - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, - DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8})); - Results.push_back( - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1})); -} - /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results) { @@ -5459,9 +5401,6 @@ void NVPTXTargetLowering::ReplaceNodeResults( switch (N->getOpcode()) { default: report_fatal_error("Unhandled custom legalization"); - case ISD::BITCAST: - ReplaceBITCAST(N, DAG, Results); - return; case ISD::LOAD: ReplaceLoadVector(N, DAG, Results); return; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 4a98fe21b81dc..446ff1536d36c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -265,8 +265,6 @@ class NVPTXTargetLowering : public TargetLowering { const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; - SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 39f2572d9fd35..d75edc3a7bb43 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -49,12 +49,15 @@ define <4 x i16> @foo2(<2 x i32> %a) { define i32 
@bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-LABEL: bitcast_v4i8_i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: umov w8, v0.h[0] +; CHECK-SD-NEXT: umov w9, v0.h[1] +; CHECK-SD-NEXT: umov w10, v0.h[2] +; CHECK-SD-NEXT: and w8, w8, #0xff +; CHECK-SD-NEXT: bfi w8, w9, #8, #8 +; CHECK-SD-NEXT: umov w9, v0.h[3] +; CHECK-SD-NEXT: bfi w8, w10, #16, #8 +; CHECK-SD-NEXT: orr w0, w8, w9, lsl #24 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: bitcast_v4i8_i32: @@ -99,15 +102,10 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-LABEL: bitcast_v2i16_i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: strh w8, [sp, #14] -; CHECK-SD-NEXT: ldr w0, [sp, #12] -; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: bfi w0, w8, #16, #16 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: bitcast_v2i16_i32: diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index 0f5b240e387ed..0221ffcb19063 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -229,15 +229,17 @@ define <2 x i1> @shufflevector_v2i1(<2 x i1> %a, <2 x i1> %b){ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-LABEL: shufflevector_v4i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: ext v0.8b, v1.8b, 
v0.8b, #6 -; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v0.4h -; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: add sp, sp, #16 -; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w8, v0.h[1] +; CHECK-SD-NEXT: umov w9, v0.h[2] +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: umov w10, v1.h[0] +; CHECK-SD-NEXT: and w8, w8, #0xff +; CHECK-SD-NEXT: bfi w8, w9, #8, #8 +; CHECK-SD-NEXT: umov w9, v1.h[3] +; CHECK-SD-NEXT: bfi w8, w10, #16, #8 +; CHECK-SD-NEXT: orr w0, w8, w9, lsl #24 +; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: shufflevector_v4i8: ; CHECK-GI: // %bb.0: @@ -285,15 +287,11 @@ define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-LABEL: shufflevector_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: strh w8, [sp, #14] -; CHECK-SD-NEXT: ldr w0, [sp, #12] -; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w0, v0.s[1] +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: bfi w0, w8, #16, #16 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: shufflevector_v2i16: @@ -462,14 +460,13 @@ define <2 x i1> @shufflevector_v2i1_zeroes(<2 x i1> %a, <2 x i1> %b){ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-LABEL: shufflevector_v4i8_zeroes: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: dup v0.4h, v0.h[0] -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: fmov w0, s0 -; 
CHECK-SD-NEXT: add sp, sp, #16 -; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w8, v0.h[0] +; CHECK-SD-NEXT: and w9, w8, #0xff +; CHECK-SD-NEXT: orr w9, w9, w9, lsl #8 +; CHECK-SD-NEXT: bfi w9, w8, #16, #8 +; CHECK-SD-NEXT: orr w0, w9, w8, lsl #24 +; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: shufflevector_v4i8_zeroes: ; CHECK-GI: // %bb.0: @@ -495,16 +492,9 @@ define <32 x i8> @shufflevector_v32i8_zeroes(<32 x i8> %a, <32 x i8> %b){ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-LABEL: shufflevector_v2i16_zeroes: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: dup v1.2s, v0.s[0] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: mov w8, v1.s[1] -; CHECK-SD-NEXT: strh w8, [sp, #14] -; CHECK-SD-NEXT: ldr w0, [sp, #12] -; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: bfi w0, w0, #16, #16 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: shufflevector_v2i16_zeroes: diff --git a/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll b/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll index 2abcbbcdd1bc6..1e061841bd2c5 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll @@ -45,12 +45,12 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; R600-LABEL: build_vector_v2i16: ; R600: ; %bb.0: ; %entry ; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: MOV T4.X, literal.x, -; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; R600-NEXT: MOV T0.X, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45) entry: store 
<2 x i16> , ptr addrspace(1) %out @@ -61,14 +61,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; R600-LABEL: build_vector_v2i16_trunc: ; R600: ; %bb.0: ; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x, +; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x, ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; R600-NEXT: OR_INT T4.X, PV.W, literal.x, -; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; R600-NEXT: OR_INT T0.X, PV.W, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45) %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 17ab8fc780fb4..71d950fc47fac 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -368,26 +368,26 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHR * T0.W, T0.X, literal.x, +; EG-NEXT: LSHR * T0.W, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; 
EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PS, -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: BCNT_INT T1.W, PS, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT T0.X, PV.W, PS, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index add62a5c39cb1..96bf7b8bd96d9 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -11,63 +11,55 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: 
v_bfe_i32 v9, v0, 0, 4 -; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1 -; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1 -; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1 -; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1 -; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 
v12, v0, 12, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1 +; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -78,10 +70,6 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -119,17 +107,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: 
s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -169,17 +151,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] @@ -191,12 +167,6 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] @@ -214,13 +184,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; 
GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] @@ -306,71 +270,65 @@ entry: define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: 
v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 -; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; 
GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry @@ -378,7 +336,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -391,11 +348,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -454,11 +406,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -468,7 +415,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -527,89 +473,77 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 -; 
GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-DL-NEXT: s_endpgm +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; 
GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, 
v16, v17, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -678,17 +612,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; 
GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -828,71 +756,65 @@ entry: define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 -; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: 
v_and_b32_e32 v13, 0xff, v13 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry @@ -900,7 +822,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -913,11 +834,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -976,11 +892,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -990,7 +901,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1049,11 +959,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1063,7 +968,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt 
vmcnt(1) @@ -1122,16 +1026,10 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX10-DL-XNACK-LABEL: idot8_acc8: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -1200,17 +1098,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -1351,65 +1243,57 @@ entry: define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; 
GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4 -; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16 -; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1 -; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1 -; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1 -; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 -; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1 -; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4 +; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16 +; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 +; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1 +; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1420,10 +1304,6 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -1463,17 +1343,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1514,17 +1388,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; 
GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -1566,12 +1434,6 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] @@ -1619,12 +1481,6 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] @@ -1742,63 +1598,55 @@ entry: define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; 
GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4 -; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 
+; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2 +; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0 +; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1809,10 +1657,6 @@ 
define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4 @@ -1850,17 +1694,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4 @@ -1900,17 +1738,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] @@ -1922,12 +1754,6 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] @@ -1945,13 +1771,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] @@ -2001,11 +1821,6 @@ entry: define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2015,12 +1830,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 @@ -2073,7 +1887,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2086,251 +1899,198 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -;
GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX8-NEXT: 
v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; 
GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX9-NEXT: v_perm_b32 v9, v9, v1, s2 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v5, v15, v2, s2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2 -; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 
-; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v17, v16, s2 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s2 -; GFX9-NEXT: v_perm_b32 v10, v15, v14, s2 +; GFX9-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GFX9-NEXT: v_perm_b32 v6, v14, v13, s2 +; GFX9-NEXT: v_alignbit_b32 v2, v12, v2, 16 +; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v9, v5 +; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v10 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v7, v6 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v8, v11, v10, s2 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX9-DL-NEXT: v_perm_b32 v9, v9, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v15, v2, s2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX9-DL-NEXT: 
v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 -; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 
12, v10 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s2 +; GFX9-DL-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GFX9-DL-NEXT: v_perm_b32 v6, v14, v13, s2 +; GFX9-DL-NEXT: v_alignbit_b32 v2, v12, v2, 16 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v9, v5 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v10 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v7, v6 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_perm_b32 v8, v11, v10, s2 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 
v8, 12, v8 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -2340,92 +2100,68 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; 
GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v14 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v5, v1, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v6, v2, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v9, v10, v9, 0x5040100 +; 
GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_perm_b32 v10, v12, v11, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX10-DL-XNACK-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX10-DL-XNACK-NEXT: v_alignbit_b32 v6, v11, v2, 16 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v16 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v17 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v9, 
v5, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v8, v9 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v10 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v13, 0x5040100 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v3, v1 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-XNACK-NEXT: 
s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -2434,73 +2170,55 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v5, v1, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v6, v0, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v9, v10, v9, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v10, v12, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 
+; GFX10-DL-NOXNACK-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX10-DL-NOXNACK-NEXT: v_alignbit_b32 v6, v11, v0, 16 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v8, v9 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v10 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, 
v13, 0x5040100 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v4, v0 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 @@ -2546,11 +2264,6 @@ entry: define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2565,7 +2278,6 @@ define amdgpu_kernel void 
@idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 @@ -2614,111 +2326,100 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 -; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 -; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 -; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; 
GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 -; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v8, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 -; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 -; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v10 -; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: s_endpgm +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 12 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; 
GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16 +; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15 +; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18 +; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 +; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 +; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX8-NEXT: 
v_ashrrev_i16_e32 v2, 12, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 +; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 +; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 +; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 +; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v3, v8, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 +; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 +; GFX8-NEXT: v_add_u16_e32 
v2, v2, v6 +; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v10 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2728,7 +2429,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2806,11 +2506,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2820,7 +2515,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2898,17 +2592,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: ; 
GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] @@ -2999,17 +2687,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 069bebdf3c469..d9ddef9b43cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -9,11 +9,6 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -28,33 +23,32 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v10, 15, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 4, 4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v3, v3, v10, s4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v3, v4, v11, v3 +; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX7-NEXT: 
v_mad_u32_u24 v3, v5, v12, v3 +; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v3, v6, v13, v3 +; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v3, v7, v14, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v3, v8, v15, v3 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -64,8 +58,6 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -76,37 +68,33 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v14, v3, 20, 4 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX8-NEXT: v_bfe_u32 v7, v0, 4, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0 +; GFX8-NEXT: v_bfe_u32 v9, v0, 8, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v6, v7, v4 +; GFX8-NEXT: v_bfe_u32 v11, v0, 12, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v8, v9, v4 +; GFX8-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v10, v11, v4 +; GFX8-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v12, v13, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 +; GFX8-NEXT: v_bfe_u32 v3, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v14, v15, v4 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -117,48 +105,42 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: 
global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 -; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX9-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v11, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v13, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX9-NEXT: v_mul_u32_u24_e32 v6, v7, v8 +; GFX9-NEXT: v_mul_u32_u24_e32 v7, v9, v10 +; GFX9-NEXT: v_mul_u32_u24_e32 v8, v11, v12 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16 -; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 -; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 -; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-NEXT: v_add3_u32 v3, v5, s0, v6 +; GFX9-NEXT: v_mul_u32_u24_e32 v9, v13, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v10, v15, v16 +; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 +; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 +; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -167,17 +149,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] @@ -189,12 +165,6 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 
s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] @@ -282,11 +252,6 @@ entry: define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -301,33 +266,32 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; 
GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -337,8 +301,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -351,37 +314,41 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: 
v_bfe_u32 v11, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v15 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u16 v4, v16, v17, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v4, v5, v10, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 +; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_mad_u16 v4, v18, v19, v4 +; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: 
v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 -; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 -; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -390,44 +357,47 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_legacy_u16 v3, v15, v16, v3 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 ; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -436,44 +406,47 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v15, v16, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -483,46 +456,49 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; 
GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: 
v_mad_u16 v3, v4, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v9, v10, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v5, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -600,11 +576,6 @@ entry: define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -619,33 +590,32 @@ define 
amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, 
v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -655,8 +625,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -669,37 +638,41 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v2, 
15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v15 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u16 v4, v16, v17, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v4, v5, v10, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 +; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_mad_u16 v4, v18, v19, v4 +; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 -; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 -; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -708,44 +681,47 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: 
s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_legacy_u16 v3, v15, v16, v3 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -754,44 +730,47 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 
v4, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v15, v16, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 +; GFX9-DL-NEXT: v_and_b32_e32 
v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -801,46 +780,49 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: 
v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_and_b32_sdwa 
v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v9, v10, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v5, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -918,11 +900,6 @@ entry: define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -937,33 +914,32 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 ; GFX7-NEXT: s_waitcnt 
vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -974,8 +950,6 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -988,28 +962,21 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 @@ -1017,6 +984,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1028,42 +996,34 @@ 
define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; 
GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 @@ -1075,42 +1035,34 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, 
v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 @@ -1124,43 +1076,35 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 4, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 
v7, 8, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 12, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 20, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] @@ -1225,11 +1169,6 @@ entry: define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1244,33 +1183,32 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; 
GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -1281,8 +1219,6 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1295,28 +1231,21 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v15, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 @@ -1324,6 +1253,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1335,42 +1265,34 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 @@ -1382,42 +1304,34 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; 
GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 @@ -1431,43 +1345,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; 
GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 4, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 12, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 20, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; 
GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] @@ -1530,11 +1436,6 @@ entry: define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1549,36 +1450,35 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v10, 15, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 4, 4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 -; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16 -; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2 -; GFX7-NEXT: 
v_mad_u32_u24 v2, v6, v13, v2 -; GFX7-NEXT: v_mad_u32_u24 v2, v5, v12, v2 -; GFX7-NEXT: v_mad_u32_u24 v2, v4, v11, v2 -; GFX7-NEXT: v_mad_u32_u24 v2, v3, v10, v2 -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v16, v3, v10, s4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v4, v4, v11, v16 +; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v4, v5, v12, v4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v4, v6, v13, v4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v4, v7, v14, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v4, v8, v15, v4 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v4 +; GFX7-NEXT: v_mad_u32_u24 v3, v3, v10, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1587,8 +1487,6 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1599,39 +1497,35 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, 
s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v14, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX8-NEXT: v_bfe_u32 v7, v0, 4, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16 -; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16 -; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, v6, v13, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, v5, v12, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, v4, v11, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3 -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v16, v4, v5, s0 +; GFX8-NEXT: v_bfe_u32 v9, v0, 8, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v16 +; GFX8-NEXT: v_mad_u32_u24 v5, v6, v7, v16 +; GFX8-NEXT: v_bfe_u32 v11, v0, 12, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v8, v9, v5 +; GFX8-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v10, v11, v5 +; GFX8-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v12, v13, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v2, 28, v0 +; GFX8-NEXT: v_bfe_u32 v3, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v14, v15, v5 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v5 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1642,49 +1536,43 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX9-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v11, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 12, 4 +; 
GFX9-NEXT: v_bfe_u32 v13, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 +; GFX9-NEXT: v_bfe_u32 v15, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0 -; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 -; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-NEXT: v_add3_u32 v2, v2, v9, v8 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 -; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v3, v5, v6, s0 +; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v10 +; GFX9-NEXT: v_mul_u32_u24_e32 v10, v11, v12 +; GFX9-NEXT: v_mad_u32_u24 v4, v7, v8, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v11, v13, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v12, v15, v16 +; GFX9-NEXT: v_add3_u32 v4, v4, v9, v10 +; GFX9-NEXT: v_add3_u32 v4, v4, v11, v12 +; GFX9-NEXT: v_mul_u32_u24_e32 v17, v5, v6 +; GFX9-NEXT: v_add3_u32 v1, v4, v1, v2 +; GFX9-NEXT: v_add3_u32 v1, v17, v3, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -1693,49 +1581,43 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, 
s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 +; GFX9-DL-NEXT: v_bfe_u32 v15, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-DL-NEXT: 
v_mad_u32_u24 v2, v3, v10, v1 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v9, v8 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v5, v6, s0 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v10 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v10, v11, v12 +; GFX9-DL-NEXT: v_mad_u32_u24 v4, v7, v8, v3 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v11, v13, v14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v12, v15, v16 +; GFX9-DL-NEXT: v_add3_u32 v4, v4, v9, v10 +; GFX9-DL-NEXT: v_add3_u32 v4, v4, v11, v12 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v5, v6 +; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v2 +; GFX9-DL-NEXT: v_add3_u32 v1, v17, v3, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1745,12 +1627,6 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] @@ -1758,38 +1634,38 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2 -; 
GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 12, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0 +; GFX10-DL-NEXT: v_mad_u32_u24 v8, v4, v5, s0 +; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 16, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v10, v10, v11 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v6, v6, v9, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v9, v12, v13 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 +; 
GFX10-DL-NEXT: v_mul_u32_u24_e32 v11, v11, v14 +; GFX10-DL-NEXT: v_add3_u32 v6, v6, v7, v10 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v4, v5 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_add3_u32 v2, v6, v9, v11 +; GFX10-DL-NEXT: v_add3_u32 v0, v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v3, v8, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -1868,11 +1744,6 @@ entry: define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1887,7 +1758,6 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 @@ -1923,8 +1793,6 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1935,21 +1803,17 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v5, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 +; GFX8-NEXT: v_bfe_u32 v9, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 ; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 ; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 @@ -1959,13 +1823,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v9, v15, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v8, v14, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v7, v13, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v6, v12, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v5, v11, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v4, v10, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ 
-1976,28 +1840,22 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 @@ -2006,18 +1864,18 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16 -; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, v10, v16 +; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v15 +; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v14 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 -; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 -; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v13 +; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v12 +; GFX9-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v11 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX9-NEXT: v_add3_u32 v1, v1, v7, v6 +; GFX9-NEXT: v_add3_u32 v1, v1, v5, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -2026,17 +1884,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] @@ -2048,12 +1900,6 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: 
s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2106,11 +1952,6 @@ entry: define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2125,34 +1966,37 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v8, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v15, 15, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 20, v2 +; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 20, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 28, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX7-NEXT: v_bfe_u32 v3, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 
v14, 28, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v10, v0, 16 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v11, v1 +; GFX7-NEXT: v_bfe_u32 v10, v2, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v10, v8, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2161,8 +2005,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2175,156 +2018,166 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt 
vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v16, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v17, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_sdwa v18, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 -; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 -; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-NEXT: 
v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-NEXT: v_and_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v17, s2 +; GFX9-NEXT: v_perm_b32 v4, v4, v15, s2 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX9-NEXT: v_perm_b32 v1, v5, v1, s2 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_perm_b32 v2, v11, v18, s2 +; GFX9-NEXT: v_perm_b32 v5, v6, v16, s2 +; GFX9-NEXT: v_perm_b32 v6, v12, v13, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v5, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v6 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, 
v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: 
v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 +; GFX9-DL-NEXT: v_perm_b32 v7, v7, v8, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v17, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v15, s2 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_perm_b32 v2, v11, v18, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v16, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v12, v13, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v6 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: 
v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: @@ -2333,12 +2186,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2346,41 +2194,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 -; GFX10-DL-NEXT: 
v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX10-DL-NEXT: v_perm_b32 v7, v7, v9, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v6, v6, v10, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v2 +; GFX10-DL-NEXT: v_perm_b32 v10, v11, v10, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v8, v8, v9, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 -; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v12, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; 
GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 28, v1 +; GFX10-DL-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v5, v5, v12, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v6 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_perm_b32 v2, v8, v2, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v9, v1, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2430,11 +2286,6 @@ entry: define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2449,34 +2300,39 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 4, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v11, v1 +; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v9, v2, 16, 4 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v12, v1 +; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 24 +; GFX7-NEXT: v_bfe_u32 v3, v0, 20, 4 +; GFX7-NEXT: v_alignbit_b32 v0, v10, v0, 24 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1 +; GFX7-NEXT: v_and_b32_e32 v16, 0xf0f, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v7, 0xf0f, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 +; GFX7-NEXT: v_bfe_u32 v10, v16, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v7, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v10, v5, v0 ; GFX7-NEXT: 
buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2485,8 +2341,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2499,67 +2354,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 -; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2 -; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4 -; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17 -; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v3 -; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19 -; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v9, v18, v9 -; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v2, v11 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_and_b32_sdwa v17, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v11, v18, v5 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v10 
+; GFX8-NEXT: v_and_b32_e32 v10, 15, v15 +; GFX8-NEXT: v_mul_lo_u16_e32 v15, v17, v19 +; GFX8-NEXT: v_or_b32_e32 v6, v11, v6 +; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v13 +; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v3, v16, v3 +; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX8-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX8-NEXT: v_or_b32_e32 v9, v3, v10 +; GFX8-NEXT: v_or_b32_sdwa v3, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v10, v10, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v6, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 +; GFX8-NEXT: v_add_u16_e32 v4, v9, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v9 +; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX8-NEXT: v_mad_u16 v2, v18, v5, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; 
GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2568,63 +2422,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16 -; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_e32 v0, v18, v10 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 15, v2 +; GFX9-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 +; GFX9-NEXT: v_mul_lo_u16_e32 v14, v17, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v10, v16, v18 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v12 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v15, v1 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v14, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX9-NEXT: v_or_b32_e32 v7, v1, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v10, v12, v0 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u16_e32 v4, v7, v4 ; GFX9-NEXT: v_add_u16_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v17, v2, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-NEXT: global_store_byte v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2633,125 +2490,136 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-DL-NEXT: global_load_dword v2, v0, 
s[10:11] ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v10 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 
v11, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v14, v17, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v16, v18 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v12 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, v15, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v10, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v0, v14, v6 +; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v8 +; 
GFX9-DL-NEXT: v_or_b32_e32 v7, v1, v9 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v8 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_or_b32_e32 v10, v12, v0 +; GFX9-DL-NEXT: v_or_b32_e32 v9, v9, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v4, v7, v4 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v17, v2, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: 
global_load_dword v1, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v12 +; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v16, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v13, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v12, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v11 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v9, v14 +; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v15, v1 +; GFX10-DL-NEXT: v_mul_lo_u16 v2, v0, v2 ; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15 -; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0 -; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7 -; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v17, v12 +; GFX10-DL-NEXT: v_mul_lo_u16 v9, v16, v13 +; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5 ; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX10-DL-NEXT: v_or_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX10-DL-NEXT: v_or_b32_sdwa v6, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v10 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u16 v6, v3, v8 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v6, v7 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v12, v7, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v17, v12, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v5 +; GFX10-DL-NEXT: v_mad_u16 v0, v16, v13, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: global_store_byte v4, v0, s[6:7] +; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2793,11 +2661,6 @@ entry: define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s14, -1 -; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2812,33 +2675,32 @@ define amdgpu_kernel 
void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, 
v14, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -2849,8 +2711,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2863,28 +2723,21 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s11 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 
v14, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 @@ -2892,6 +2745,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2903,57 +2757,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 28, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-NEXT: v_bfe_u32 v8, v1, 
16, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_alignbit_b32 v9, v13, v2, 16 +; GFX9-NEXT: v_alignbit_b32 v7, v7, v1, 16 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s0 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s0 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NEXT: v_perm_b32 v11, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v11 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v6, v7, v9 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s0 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v8, v14 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm @@ -2963,57 +2807,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 28, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 
-; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 ; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_alignbit_b32 v9, v13, v2, 16 +; GFX9-DL-NEXT: v_alignbit_b32 v7, v7, v1, 16 +; GFX9-DL-NEXT: v_perm_b32 v2, v10, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v4, v1, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
-; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_perm_b32 v11, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v11 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v9 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_perm_b32 v14, v15, v14, s0 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v8, v14 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm @@ -3024,57 +2858,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: 
v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX10-DL-NEXT: v_perm_b32 v4, v4, v1, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v5, v5, v2, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v2 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_perm_b32 v5, v8, v7, 0x5040100 ; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 -; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 20, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NEXT: 
v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; GFX10-DL-NEXT: v_alignbit_b32 v7, v8, v2, 16 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v4 ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v5, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 8704f4e780448..c746089733df0 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1572,96 +1572,85 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; EG-LABEL: v5i8_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @16, KC0[], KC1[] +; EG-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 4 @6 -; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1 +; EG-NEXT: ALU 19, @21, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T5.XW, T2.X +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 -; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 -; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 -; EG-NEXT: VTX_READ_8 T9.X, 
T5.X, 46, #3 -; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3 +; EG-NEXT: VTX_READ_8 T2.X, T0.X, 46, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 44, #3 +; EG-NEXT: VTX_READ_8 T4.X, T0.X, 45, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 48, #3 ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: MOV * T5.X, 0.0, -; EG-NEXT: ALU clause starting at 17: +; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, -; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T5.X, T2.W, PV.W, +; EG-NEXT: ALU clause starting at 21: +; EG-NEXT: LSHL * T1.W, T1.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T5.X, T0.X, PV.W, ; EG-NEXT: LSHL * T5.W, literal.x, PV.W, ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) ; EG-NEXT: MOV T5.Y, 0.0, ; EG-NEXT: MOV T5.Z, 0.0, -; EG-NEXT: AND_INT T1.W, T9.X, literal.x, -; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: LSHL T1.W, PV.W, literal.x, -; EG-NEXT: LSHL * T2.W, T7.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: OR_INT T1.W, PS, PV.W, -; EG-NEXT: LSHL * T2.W, T0.Z, literal.x, +; EG-NEXT: LSHL * T1.W, T4.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T1.W, T3.X, PV.W, +; EG-NEXT: LSHL * T2.W, T2.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT T1.W, PV.W, PS, -; EG-NEXT: AND_INT * T2.W, T6.X, literal.x, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: OR_INT T6.X, PV.W, PS, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHL * T2.W, T1.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, 
KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR * T8.X, T0.W, literal.x, +; EG-NEXT: LSHR * T2.X, T0.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: v5i8_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 0, @16, KC0[], KC1[] +; CM-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 4 @6 -; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X -; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X +; CM-NEXT: ALU 18, @21, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X +; CM-NEXT: MEM_RAT MSKOR T5.XW, T0.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 -; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 -; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 -; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 -; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 +; CM-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3 +; CM-NEXT: VTX_READ_8 T2.X, T0.X, 46, #3 +; CM-NEXT: VTX_READ_8 T3.X, T0.X, 44, #3 +; CM-NEXT: VTX_READ_8 T4.X, T0.X, 45, #3 +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 48, #3 ; CM-NEXT: ALU clause starting at 16: -; CM-NEXT: MOV * T5.X, 0.0, -; CM-NEXT: ALU clause starting at 17: +; CM-NEXT: MOV T0.X, 0.0, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, -; CM-NEXT: LSHL * T1.W, PV.W, literal.y, -; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) -; CM-NEXT: LSHL T5.X, PV.Z, PV.W, +; CM-NEXT: ALU clause starting at 21: +; CM-NEXT: LSHL * T1.W, T1.W, literal.x, +; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; CM-NEXT: LSHL T5.X, T0.X, PV.W, ; CM-NEXT: LSHL * T5.W, literal.x, PV.W, ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) ; CM-NEXT: MOV T5.Y, 0.0, ; CM-NEXT: MOV T5.Z, 0.0, -; CM-NEXT: AND_INT * T1.W, T9.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Y, T8.X, 
literal.x, -; CM-NEXT: LSHL T0.Z, PV.W, literal.y, -; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) -; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, -; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, +; CM-NEXT: LSHL * T1.W, T4.X, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T7.X, T0.W, literal.x, +; CM-NEXT: OR_INT T0.Z, T3.X, PV.W, +; CM-NEXT: LSHL * T1.W, T2.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHR T0.X, T0.W, literal.x, ; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, -; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, -; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) -; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, -; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHL * T0.W, T1.X, literal.y, +; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44) +; CM-NEXT: OR_INT * T1.X, PV.Z, PV.W, +; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: store <5 x i8> %in, ptr addrspace(1) %out, align 4 @@ -2418,214 +2407,88 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; EG-LABEL: v8i8_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 1, @36, KC0[], KC1[] -; EG-NEXT: TEX 0 @20 -; EG-NEXT: ALU 5, @38, KC0[], KC1[] -; EG-NEXT: TEX 0 @22 -; EG-NEXT: ALU 5, @44, KC0[], KC1[] -; EG-NEXT: TEX 0 @24 -; EG-NEXT: ALU 7, @50, KC0[], KC1[] -; EG-NEXT: TEX 0 @26 -; EG-NEXT: ALU 7, @58, KC0[], KC1[] -; EG-NEXT: TEX 0 @28 -; EG-NEXT: ALU 7, @66, KC0[], KC1[] -; EG-NEXT: TEX 0 @30 -; EG-NEXT: ALU 7, @74, KC0[], KC1[] -; EG-NEXT: TEX 0 @32 -; EG-NEXT: ALU 5, @82, KC0[], KC1[] -; EG-NEXT: TEX 0 @34 -; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 +; EG-NEXT: ALU 0, @24, KC0[], KC1[] +; EG-NEXT: TEX 2 @8 +; EG-NEXT: ALU 2, @25, KC0[], KC1[] +; EG-NEXT: TEX 4 @14 +; EG-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T1.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 20: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 -; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 -; EG-NEXT: Fetch clause starting at 24: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 -; EG-NEXT: Fetch clause starting at 26: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 -; EG-NEXT: Fetch clause starting at 28: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 -; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 -; EG-NEXT: Fetch clause starting at 32: -; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 -; EG-NEXT: Fetch clause starting at 34: -; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 -; EG-NEXT: ALU clause starting at 36: -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: MOV * T5.X, 0.0, -; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: LSHL T0.W, T6.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 44: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T6.X, literal.y, -; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 50: -; EG-NEXT: AND_INT T0.W, T6.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 58: -; EG-NEXT: AND_INT T0.W, T6.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 
0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 66: -; EG-NEXT: AND_INT T0.W, T6.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65281(nan) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 74: -; EG-NEXT: AND_INT T0.W, T6.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65281(nan) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 49, #3 +; EG-NEXT: VTX_READ_8 T2.X, T0.X, 50, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 48, #3 +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 51, #3 +; EG-NEXT: VTX_READ_8 T4.X, T0.X, 46, #3 +; EG-NEXT: VTX_READ_8 T5.X, T0.X, 44, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 45, #3 +; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 25: +; EG-NEXT: LSHL * T0.W, T1.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 82: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, -; EG-NEXT: -256(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T5.Y, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 88: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, -; EG-NEXT: -256(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT T5.X, PV.W, PS, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T0.W, T3.X, PV.W, +; EG-NEXT: ALU clause starting at 28: +; EG-NEXT: LSHL T1.W, 
T2.X, literal.x, +; EG-NEXT: LSHL * T2.W, T0.X, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: OR_INT T0.Y, T5.X, PS, +; EG-NEXT: LSHL T0.Z, T4.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: OR_INT T0.W, T0.W, PV.W, +; EG-NEXT: LSHL * T1.W, T3.X, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: OR_INT T1.Y, PV.W, PS, +; EG-NEXT: OR_INT T0.W, PV.Y, PV.Z, +; EG-NEXT: LSHL * T1.W, T1.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T1.X, PV.W, PS, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: v8i8_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 1, @36, KC0[], KC1[] -; CM-NEXT: TEX 0 @20 -; CM-NEXT: ALU 5, @38, KC0[], KC1[] -; CM-NEXT: TEX 0 @22 -; CM-NEXT: ALU 5, @44, KC0[], KC1[] -; CM-NEXT: TEX 0 @24 -; CM-NEXT: ALU 7, @50, KC0[], KC1[] -; CM-NEXT: TEX 0 @26 -; CM-NEXT: ALU 7, @58, KC0[], KC1[] -; CM-NEXT: TEX 0 @28 -; CM-NEXT: ALU 7, @66, KC0[], KC1[] -; CM-NEXT: TEX 0 @30 -; CM-NEXT: ALU 7, @74, KC0[], KC1[] -; CM-NEXT: TEX 0 @32 -; CM-NEXT: ALU 5, @82, KC0[], KC1[] -; CM-NEXT: TEX 0 @34 -; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X +; CM-NEXT: ALU 0, @24, KC0[], KC1[] +; CM-NEXT: TEX 2 @8 +; CM-NEXT: ALU 2, @25, KC0[], KC1[] +; CM-NEXT: TEX 4 @14 +; CM-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD -; CM-NEXT: Fetch clause starting at 20: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 -; CM-NEXT: Fetch clause starting at 22: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 -; CM-NEXT: Fetch clause starting at 24: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 -; CM-NEXT: Fetch clause starting at 26: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 -; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 -; CM-NEXT: Fetch clause starting at 30: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 -; CM-NEXT: Fetch clause 
starting at 32: -; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 -; CM-NEXT: Fetch clause starting at 34: -; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 -; CM-NEXT: ALU clause starting at 36: -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: MOV * T5.X, 0.0, -; CM-NEXT: ALU clause starting at 38: -; CM-NEXT: LSHL T0.Z, T6.X, literal.x, -; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, -; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) -; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 44: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T6.X, literal.y, -; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 50: -; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 58: -; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 66: -; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -65281(nan), 8(1.121039e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 74: -; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 
0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -65281(nan), 8(1.121039e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 82: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, -; CM-NEXT: -256(nan), 255(3.573311e-43) -; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.Y, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 88: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T5.X, literal.y, -; CM-NEXT: -256(nan), 255(3.573311e-43) -; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, -; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; CM-NEXT: Fetch clause starting at 8: +; CM-NEXT: VTX_READ_8 T1.X, T0.X, 49, #3 +; CM-NEXT: VTX_READ_8 T2.X, T0.X, 50, #3 +; CM-NEXT: VTX_READ_8 T3.X, T0.X, 48, #3 +; CM-NEXT: Fetch clause starting at 14: +; CM-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3 +; CM-NEXT: VTX_READ_8 T3.X, T0.X, 51, #3 +; CM-NEXT: VTX_READ_8 T4.X, T0.X, 46, #3 +; CM-NEXT: VTX_READ_8 T5.X, T0.X, 44, #3 +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 45, #3 +; CM-NEXT: ALU clause starting at 24: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 25: +; CM-NEXT: LSHL * T0.W, T1.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: OR_INT * T0.W, T3.X, PV.W, +; CM-NEXT: ALU clause starting at 28: +; CM-NEXT: LSHL T0.Z, T2.X, literal.x, +; CM-NEXT: LSHL * T1.W, T0.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; CM-NEXT: OR_INT T0.X, T5.X, PV.W, +; CM-NEXT: LSHL T0.Y, T4.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: OR_INT T0.Z, T0.W, PV.Z, +; CM-NEXT: LSHL * T0.W, T3.X, literal.y, BS:VEC_201 +; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; CM-NEXT: OR_INT T1.Y, PV.Z, PV.W, +; CM-NEXT: OR_INT T0.Z, PV.X, PV.Y, +; CM-NEXT: LSHL * T0.W, T1.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: 
OR_INT * T1.X, PV.Z, PV.W, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: store <8 x i8> %in, ptr addrspace(1) %out @@ -3147,406 +3010,144 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; EG-LABEL: v16i8_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 1, @68, KC0[], KC1[] -; EG-NEXT: TEX 0 @36 -; EG-NEXT: ALU 5, @70, KC0[], KC1[] -; EG-NEXT: TEX 0 @38 -; EG-NEXT: ALU 5, @76, KC0[], KC1[] -; EG-NEXT: TEX 0 @40 -; EG-NEXT: ALU 5, @82, KC0[], KC1[] -; EG-NEXT: TEX 0 @42 -; EG-NEXT: ALU 5, @88, KC0[], KC1[] -; EG-NEXT: TEX 0 @44 -; EG-NEXT: ALU 7, @94, KC0[], KC1[] -; EG-NEXT: TEX 0 @46 -; EG-NEXT: ALU 7, @102, KC0[], KC1[] -; EG-NEXT: TEX 0 @48 -; EG-NEXT: ALU 7, @110, KC0[], KC1[] -; EG-NEXT: TEX 0 @50 -; EG-NEXT: ALU 7, @118, KC0[], KC1[] -; EG-NEXT: TEX 0 @52 -; EG-NEXT: ALU 7, @126, KC0[], KC1[] -; EG-NEXT: TEX 0 @54 -; EG-NEXT: ALU 7, @134, KC0[], KC1[] -; EG-NEXT: TEX 0 @56 -; EG-NEXT: ALU 7, @142, KC0[], KC1[] -; EG-NEXT: TEX 0 @58 -; EG-NEXT: ALU 7, @150, KC0[], KC1[] -; EG-NEXT: TEX 0 @60 -; EG-NEXT: ALU 5, @158, KC0[], KC1[] -; EG-NEXT: TEX 0 @62 -; EG-NEXT: ALU 5, @164, KC0[], KC1[] -; EG-NEXT: TEX 0 @64 -; EG-NEXT: ALU 5, @170, KC0[], KC1[] -; EG-NEXT: TEX 0 @66 -; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 +; EG-NEXT: ALU 0, @42, KC0[], KC1[] +; EG-NEXT: TEX 3 @10 +; EG-NEXT: ALU 3, @43, KC0[], KC1[] +; EG-NEXT: TEX 4 @18 +; EG-NEXT: ALU 7, @47, KC0[], KC1[] +; EG-NEXT: TEX 6 @28 +; EG-NEXT: ALU 21, @55, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 36: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 -; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 -; EG-NEXT: Fetch clause starting at 40: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 -; EG-NEXT: Fetch clause starting at 42: -; EG-NEXT: 
VTX_READ_8 T8.X, T7.X, 55, #3 -; EG-NEXT: Fetch clause starting at 44: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 -; EG-NEXT: Fetch clause starting at 46: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 -; EG-NEXT: Fetch clause starting at 48: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 -; EG-NEXT: Fetch clause starting at 50: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 -; EG-NEXT: Fetch clause starting at 52: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 -; EG-NEXT: Fetch clause starting at 54: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 -; EG-NEXT: Fetch clause starting at 56: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 -; EG-NEXT: Fetch clause starting at 58: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 -; EG-NEXT: Fetch clause starting at 60: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 -; EG-NEXT: Fetch clause starting at 62: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 -; EG-NEXT: Fetch clause starting at 64: -; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 -; EG-NEXT: Fetch clause starting at 66: -; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 -; EG-NEXT: ALU clause starting at 68: -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: MOV * T7.X, 0.0, -; EG-NEXT: ALU clause starting at 70: -; EG-NEXT: LSHL T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 76: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T8.X, literal.y, -; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 82: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T8.X, literal.y, -; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 88: -; EG-NEXT: AND_INT 
T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T8.X, literal.y, -; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 94: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 102: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 110: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 118: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 126: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65281(nan) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: 
MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 134: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65281(nan) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 142: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65281(nan) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: Fetch clause starting at 10: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 65, #3 +; EG-NEXT: VTX_READ_8 T2.X, T0.X, 66, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 64, #3 +; EG-NEXT: VTX_READ_8 T4.X, T0.X, 61, #3 +; EG-NEXT: Fetch clause starting at 18: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 62, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 60, #3 +; EG-NEXT: VTX_READ_8 T4.X, T0.X, 67, #3 +; EG-NEXT: VTX_READ_8 T5.X, T0.X, 56, #3 +; EG-NEXT: VTX_READ_8 T6.X, T0.X, 57, #3 +; EG-NEXT: Fetch clause starting at 28: +; EG-NEXT: VTX_READ_8 T2.X, T0.X, 55, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 59, #3 +; EG-NEXT: VTX_READ_8 T4.X, T0.X, 54, #3 +; EG-NEXT: VTX_READ_8 T5.X, T0.X, 52, #3 +; EG-NEXT: VTX_READ_8 T6.X, T0.X, 53, #3 +; EG-NEXT: VTX_READ_8 T7.X, T0.X, 63, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 58, #3 +; EG-NEXT: ALU clause starting at 42: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 43: +; EG-NEXT: LSHL T0.W, T1.X, literal.x, +; EG-NEXT: LSHL * T1.W, T4.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 150: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65281(nan) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: OR_INT * T0.W, T3.X, PV.W, +; 
EG-NEXT: ALU clause starting at 47: +; EG-NEXT: LSHL T2.W, T2.X, literal.x, +; EG-NEXT: LSHL * T3.W, T6.X, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: OR_INT T0.Y, T5.X, PS, +; EG-NEXT: OR_INT T0.Z, T0.W, PV.W, +; EG-NEXT: LSHL T0.W, T4.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: OR_INT * T1.W, T3.X, T1.W, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 55: +; EG-NEXT: LSHL * T2.W, T1.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, T0.X, literal.x, +; EG-NEXT: OR_INT T1.Y, T1.W, PV.W, +; EG-NEXT: LSHL T1.Z, T7.X, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: LSHL T1.W, T6.X, literal.z, BS:VEC_201 +; EG-NEXT: OR_INT * T0.W, T0.Z, T0.W, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 158: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, -; EG-NEXT: -256(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T7.W, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 164: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, -; EG-NEXT: -256(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T7.Z, PV.W, PS, -; EG-NEXT: MOV T3.X, PV.Z, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 170: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, -; EG-NEXT: -256(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T7.Y, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 176: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T7.X, literal.y, -; EG-NEXT: -256(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT T7.X, PV.W, PS, -; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT T1.X, T5.X, PV.W, 
+; EG-NEXT: LSHL T2.Y, T4.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: OR_INT T0.Z, PV.Y, PV.Z, +; EG-NEXT: OR_INT T1.W, T0.Y, PV.X, +; EG-NEXT: LSHL * T2.W, T3.X, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: OR_INT T0.Y, PV.W, PS, +; EG-NEXT: OR_INT T1.W, PV.X, PV.Y, +; EG-NEXT: LSHL * T2.W, T2.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: v16i8_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 1, @68, KC0[], KC1[] -; CM-NEXT: TEX 0 @36 -; CM-NEXT: ALU 5, @70, KC0[], KC1[] -; CM-NEXT: TEX 0 @38 -; CM-NEXT: ALU 5, @76, KC0[], KC1[] -; CM-NEXT: TEX 0 @40 -; CM-NEXT: ALU 5, @82, KC0[], KC1[] -; CM-NEXT: TEX 0 @42 -; CM-NEXT: ALU 5, @88, KC0[], KC1[] -; CM-NEXT: TEX 0 @44 -; CM-NEXT: ALU 7, @94, KC0[], KC1[] -; CM-NEXT: TEX 0 @46 -; CM-NEXT: ALU 7, @102, KC0[], KC1[] -; CM-NEXT: TEX 0 @48 -; CM-NEXT: ALU 7, @110, KC0[], KC1[] -; CM-NEXT: TEX 0 @50 -; CM-NEXT: ALU 7, @118, KC0[], KC1[] -; CM-NEXT: TEX 0 @52 -; CM-NEXT: ALU 7, @126, KC0[], KC1[] -; CM-NEXT: TEX 0 @54 -; CM-NEXT: ALU 7, @134, KC0[], KC1[] -; CM-NEXT: TEX 0 @56 -; CM-NEXT: ALU 7, @142, KC0[], KC1[] -; CM-NEXT: TEX 0 @58 -; CM-NEXT: ALU 7, @150, KC0[], KC1[] -; CM-NEXT: TEX 0 @60 -; CM-NEXT: ALU 5, @158, KC0[], KC1[] -; CM-NEXT: TEX 0 @62 -; CM-NEXT: ALU 5, @164, KC0[], KC1[] -; CM-NEXT: TEX 0 @64 -; CM-NEXT: ALU 5, @170, KC0[], KC1[] -; CM-NEXT: TEX 0 @66 -; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X +; CM-NEXT: ALU 0, @42, KC0[], KC1[] +; CM-NEXT: TEX 3 @10 +; CM-NEXT: ALU 4, @43, KC0[], KC1[] +; CM-NEXT: TEX 4 @18 +; CM-NEXT: ALU 8, @48, KC0[], KC1[] +; CM-NEXT: TEX 6 @28 +; CM-NEXT: ALU 19, @57, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD -; CM-NEXT: Fetch clause starting at 36: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 -; 
CM-NEXT: Fetch clause starting at 38: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 -; CM-NEXT: Fetch clause starting at 40: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 -; CM-NEXT: Fetch clause starting at 42: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 -; CM-NEXT: Fetch clause starting at 44: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 -; CM-NEXT: Fetch clause starting at 46: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 -; CM-NEXT: Fetch clause starting at 48: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 -; CM-NEXT: Fetch clause starting at 50: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 -; CM-NEXT: Fetch clause starting at 52: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 -; CM-NEXT: Fetch clause starting at 54: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 -; CM-NEXT: Fetch clause starting at 56: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 -; CM-NEXT: Fetch clause starting at 58: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 -; CM-NEXT: Fetch clause starting at 60: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 -; CM-NEXT: Fetch clause starting at 62: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 -; CM-NEXT: Fetch clause starting at 64: -; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 -; CM-NEXT: Fetch clause starting at 66: -; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 -; CM-NEXT: ALU clause starting at 68: -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: MOV * T7.X, 0.0, -; CM-NEXT: ALU clause starting at 70: -; CM-NEXT: LSHL T0.Z, T8.X, literal.x, -; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, -; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) -; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 76: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T8.X, literal.y, -; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 82: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, 
T8.X, literal.y, -; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.W, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 88: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T8.X, literal.y, -; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 94: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 102: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 110: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.W, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 118: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 126: -; CM-NEXT: AND_INT * T0.W, T8.X, 
literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -65281(nan), 8(1.121039e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 134: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -65281(nan), 8(1.121039e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 142: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -65281(nan), 8(1.121039e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.W, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 150: -; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, PV.W, literal.y, -; CM-NEXT: -65281(nan), 8(1.121039e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 158: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, -; CM-NEXT: -256(nan), 255(3.573311e-43) -; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 164: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, -; CM-NEXT: -256(nan), 255(3.573311e-43) -; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.Z, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 170: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T8.X, 
literal.y, -; CM-NEXT: -256(nan), 255(3.573311e-43) -; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.Y, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 176: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T7.X, literal.y, -; CM-NEXT: -256(nan), 255(3.573311e-43) -; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, -; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, +; CM-NEXT: Fetch clause starting at 10: +; CM-NEXT: VTX_READ_8 T1.X, T0.X, 65, #3 +; CM-NEXT: VTX_READ_8 T2.X, T0.X, 66, #3 +; CM-NEXT: VTX_READ_8 T3.X, T0.X, 61, #3 +; CM-NEXT: VTX_READ_8 T4.X, T0.X, 64, #3 +; CM-NEXT: Fetch clause starting at 18: +; CM-NEXT: VTX_READ_8 T1.X, T0.X, 67, #3 +; CM-NEXT: VTX_READ_8 T3.X, T0.X, 52, #3 +; CM-NEXT: VTX_READ_8 T4.X, T0.X, 53, #3 +; CM-NEXT: VTX_READ_8 T5.X, T0.X, 62, #3 +; CM-NEXT: VTX_READ_8 T6.X, T0.X, 60, #3 +; CM-NEXT: Fetch clause starting at 28: +; CM-NEXT: VTX_READ_8 T2.X, T0.X, 55, #3 +; CM-NEXT: VTX_READ_8 T3.X, T0.X, 59, #3 +; CM-NEXT: VTX_READ_8 T4.X, T0.X, 54, #3 +; CM-NEXT: VTX_READ_8 T5.X, T0.X, 58, #3 +; CM-NEXT: VTX_READ_8 T6.X, T0.X, 56, #3 +; CM-NEXT: VTX_READ_8 T7.X, T0.X, 63, #3 +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 57, #3 +; CM-NEXT: ALU clause starting at 42: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 43: +; CM-NEXT: LSHL * T0.W, T1.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: OR_INT T0.Z, T4.X, PV.W, +; CM-NEXT: LSHL * T0.W, T3.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: ALU clause starting at 48: +; CM-NEXT: LSHL T2.X, T2.X, literal.x, +; CM-NEXT: OR_INT T0.Y, T6.X, T0.W, BS:VEC_120/SCL_212 +; CM-NEXT: LSHL * T1.Z, T5.X, literal.x, BS:VEC_201 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHL * T0.W, T4.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: OR_INT T1.Y, T3.X, PV.W, +; CM-NEXT: OR_INT T1.Z, T0.Y, T1.Z, +; CM-NEXT: OR_INT * T0.W, T0.Z, T2.X, +; CM-NEXT: ALU clause 
starting at 57: +; CM-NEXT: LSHL T0.Z, T1.X, literal.x, +; CM-NEXT: LSHL * T1.W, T0.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; CM-NEXT: LSHL T0.X, T7.X, literal.x, +; CM-NEXT: OR_INT T0.Y, T6.X, PV.W, BS:VEC_120/SCL_212 +; CM-NEXT: LSHL T2.Z, T5.X, literal.y, BS:VEC_201 +; CM-NEXT: OR_INT * T0.W, T0.W, PV.Z, +; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44) +; CM-NEXT: LSHL T1.X, T4.X, literal.x, +; CM-NEXT: OR_INT T0.Y, PV.Y, PV.Z, +; CM-NEXT: OR_INT T0.Z, T1.Z, PV.X, +; CM-NEXT: LSHL * T1.W, T3.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; CM-NEXT: OR_INT T0.Y, PV.Y, PV.W, +; CM-NEXT: OR_INT T1.Z, T1.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T2.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: OR_INT * T0.X, PV.Z, PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: store <16 x i8> %in, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index bb98af4e7a5c7..48c8ab60d2829 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1122,18 +1122,18 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T4.Y, T4.X, literal.x, +; EG-NEXT: LSHR * T0.Y, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, 
literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: @@ -1206,21 +1206,20 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.W, T4.X, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) -; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR * T0.Y, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: @@ -5716,20 +5715,20 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, 
KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T4.Z, T4.X, literal.x, +; EG-NEXT: LSHR * T0.Z, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: MOV T4.Y, 0.0, -; EG-NEXT: MOV T4.W, 0.0, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV T0.W, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: @@ -5812,22 +5811,22 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: ASHR * T4.W, T4.X, literal.x, +; EG-NEXT: ASHR * T0.W, T0.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR * T4.Z, T4.X, literal.x, +; EG-NEXT: ASHR * T0.Z, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T4.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) -; EG-NEXT: ASHR * T4.Y, PV.X, literal.x, +; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 
ff55ab8859c83..a990982c7e313 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -1272,23 +1272,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, PV.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T4.Y, T4.X, literal.x, T0.W, -; EG-NEXT: LSHR * T4.W, T4.X, literal.y, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: LSHR * T1.W, T0.X, literal.y, ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: BFE_UINT * T1.Z, T0.X, literal.x, PV.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.Y, T0.X, literal.x, T0.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.X, T0.X, literal.x, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.y, ; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: @@ -1375,26 +1375,25 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T4.X, 1 +; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, 
T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T4.X, literal.y, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T4.X, literal.y, +; EG-NEXT: ASHR * T1.W, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T1.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T0.X, literal.y, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T5.Z, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T4.X, literal.x, +; EG-NEXT: BFE_INT T1.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T0.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y, +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT * T1.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: @@ -1518,35 +1517,34 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, 
KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, PV.W, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: LSHR * T1.W, T0.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_UINT * T1.Z, T0.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T7.Z, T5.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T6.W, T5.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T6.X, T5.X, literal.x, -; EG-NEXT: BFE_UINT T7.Y, T5.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T1.Y, T0.X, literal.x, T0.W, +; EG-NEXT: LSHR * T2.W, T0.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T1.X, T0.X, literal.x, +; EG-NEXT: BFE_UINT T2.Z, T0.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR * T7.W, T5.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T7.X, T5.Y, literal.x, +; EG-NEXT: BFE_UINT * T2.Y, T0.Y, literal.x, T0.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.X, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) -; EG-NEXT: LSHR * T8.X, PV.W, literal.x, +; EG-NEXT: LSHR * T3.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: @@ -1677,38 +1675,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 23, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW 
T7.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.X, literal.y, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x, -; EG-NEXT: LSHR T0.Z, T5.Y, literal.y, -; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T0.Y, T5.Y, literal.x, -; EG-NEXT: BFE_INT T6.Z, PS, 0.0, literal.y, -; EG-NEXT: BFE_INT T7.W, PV.Z, 0.0, literal.y, -; EG-NEXT: LSHR * T0.W, T5.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) -; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT T6.Y, PS, 0.0, literal.y, -; EG-NEXT: BFE_INT T7.Z, PV.Y, 0.0, literal.y, -; EG-NEXT: LSHR T0.W, T5.Y, literal.y, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, +; EG-NEXT: ASHR * T1.W, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T1.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR T0.W, T0.X, literal.y, +; EG-NEXT: ASHR * T2.W, T0.Y, literal.z, +; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T2.X, T0.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T1.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR T0.W, T0.Y, literal.y, +; EG-NEXT: LSHR * T3.W, T0.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT T1.Y, PS, 0.0, literal.y, +; 
EG-NEXT: BFE_INT T2.Z, PV.W, 0.0, literal.y, +; EG-NEXT: LSHR T0.W, T0.Y, literal.y, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T8.X, PS, literal.x, -; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y, +; EG-NEXT: LSHR T3.X, PS, literal.x, +; EG-NEXT: BFE_INT * T2.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32: @@ -1911,56 +1907,54 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T11.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 +; EG-NEXT: ALU 37, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T8.Z, T7.X, literal.x, PV.W, +; EG-NEXT: MOV T1.W, literal.x, +; EG-NEXT: LSHR * T2.W, T0.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_UINT * T2.Z, T0.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T9.Z, T7.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T8.W, 
T7.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T8.X, T7.X, literal.x, -; EG-NEXT: BFE_UINT T9.Y, T7.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T2.Y, T0.X, literal.x, T1.W, +; EG-NEXT: LSHR * T3.W, T0.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T2.X, T0.X, literal.x, +; EG-NEXT: BFE_UINT T3.Z, T0.Y, literal.y, T1.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T10.Z, T7.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T9.W, T7.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T9.X, T7.Y, literal.x, -; EG-NEXT: BFE_UINT T10.Y, T7.Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T11.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T12.Z, T7.W, literal.y, T0.W, -; EG-NEXT: LSHR T10.W, T7.Z, literal.z, -; EG-NEXT: AND_INT * T10.X, T7.Z, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T3.Y, T0.Y, literal.x, T1.W, +; EG-NEXT: LSHR * T4.W, T0.Z, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T3.X, T0.Y, literal.x, +; EG-NEXT: BFE_UINT T4.Z, T0.Z, literal.y, T1.W, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) +; EG-NEXT: LSHR T1.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T4.Y, T0.Z, literal.y, T1.W, +; EG-NEXT: LSHR T5.W, T0.W, literal.z, +; EG-NEXT: AND_INT * T4.X, T0.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T12.Y, T7.W, literal.x, T0.W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 
32(4.484155e-44) -; EG-NEXT: LSHR T13.X, PV.W, literal.x, -; EG-NEXT: LSHR T12.W, T7.W, literal.y, -; EG-NEXT: AND_INT * T12.X, T7.W, literal.z, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) +; EG-NEXT: BFE_UINT T5.Z, T0.W, literal.x, T1.W, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T6.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T5.Y, T0.W, literal.y, T1.W, BS:VEC_021/SCL_122 +; EG-NEXT: AND_INT * T5.X, T0.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T14.X, PV.W, literal.x, +; EG-NEXT: LSHR * T7.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: @@ -2177,64 +2171,58 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 47, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T7.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T8.X, 1 +; EG-NEXT: ALU 41, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; EG-NEXT: LSHR T0.W, T7.W, literal.y, -; EG-NEXT: LSHR * T1.W, 
T7.Z, literal.z, +; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.Y, T7.W, literal.y, -; EG-NEXT: LSHR T0.Z, T7.Z, literal.z, -; EG-NEXT: LSHR T2.W, T7.Y, literal.x, -; EG-NEXT: LSHR * T3.W, T7.X, literal.y, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T10.X, T7.Y, 0.0, literal.x, -; EG-NEXT: LSHR T1.Y, T7.Z, literal.y, -; EG-NEXT: LSHR T1.Z, T7.Y, literal.y, -; EG-NEXT: BFE_INT T9.W, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T3.W, T7.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T11.X, T7.Z, 0.0, literal.x, -; EG-NEXT: LSHR T2.Y, T7.Y, literal.y, -; EG-NEXT: BFE_INT T9.Z, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T10.W, PV.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T3.W, T7.X, literal.x, +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: ASHR * T3.W, T0.X, literal.y, +; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) +; EG-NEXT: BFE_INT T3.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR T1.Y, T0.W, literal.y, +; EG-NEXT: LSHR T1.Z, T0.Z, literal.y, +; EG-NEXT: LSHR T1.W, T0.X, literal.y, +; EG-NEXT: ASHR * T4.W, T0.Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T12.X, T7.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T9.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T10.Z, PV.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T11.W, T1.Y, 0.0, literal.x, -; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T4.X, T0.Y, 0.0, literal.x, +; EG-NEXT: LSHR T2.Y, T0.Y, literal.y, +; EG-NEXT: BFE_INT T3.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T0.X, literal.x, +; EG-NEXT: ASHR * T5.W, T0.Z, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T7.X, PS, 
literal.x, -; EG-NEXT: BFE_INT T10.Y, T2.W, 0.0, literal.y, -; EG-NEXT: BFE_INT T11.Z, T0.Z, 0.0, literal.y, -; EG-NEXT: BFE_INT T12.W, T0.Y, 0.0, literal.y, -; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T13.X, PS, literal.x, -; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y, -; EG-NEXT: BFE_INT T12.Z, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR T0.W, T7.W, literal.y, BS:VEC_201 +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T3.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T4.Z, PV.Y, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T0.Y, literal.x, +; EG-NEXT: ASHR * T6.W, T0.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T6.X, T0.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T4.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T5.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T0.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T0.X, PS, literal.x, +; EG-NEXT: BFE_INT T5.Y, PV.W, 0.0, literal.y, +; EG-NEXT: BFE_INT T6.Z, T1.Y, 0.0, literal.y, +; EG-NEXT: LSHR T0.W, T0.W, literal.y, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T14.X, PS, literal.x, -; EG-NEXT: BFE_INT * T12.Y, PV.W, 0.0, literal.y, +; EG-NEXT: LSHR T7.X, PS, literal.x, +; EG-NEXT: BFE_INT * T6.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: @@ -2595,97 +2583,95 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @12 -; EG-NEXT: ALU 75, @17, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0 -; 
EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1 +; EG-NEXT: ALU 73, @17, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T15.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T9.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 17: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W, +; EG-NEXT: MOV T2.W, literal.x, +; EG-NEXT: LSHR * T3.W, T0.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_UINT * T3.Z, T0.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T14.Z, T11.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T13.W, T11.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T13.X, T11.X, literal.x, -; EG-NEXT: BFE_UINT T14.Y, T11.Y, literal.y, T0.W, 
-; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T3.Y, T0.X, literal.x, T2.W, +; EG-NEXT: LSHR * T4.W, T0.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T3.X, T0.X, literal.x, +; EG-NEXT: BFE_UINT T4.Z, T0.Y, literal.y, T2.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T15.Z, T11.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T14.W, T11.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T14.X, T11.Y, literal.x, -; EG-NEXT: BFE_UINT T15.Y, T11.Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T16.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T17.Z, T11.W, literal.y, T0.W, -; EG-NEXT: LSHR T15.W, T11.Z, literal.z, -; EG-NEXT: AND_INT * T15.X, T11.Z, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T4.Y, T0.Y, literal.x, T2.W, +; EG-NEXT: LSHR * T5.W, T0.Z, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T4.X, T0.Y, literal.x, +; EG-NEXT: BFE_UINT T5.Z, T0.Z, literal.y, T2.W, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T5.Y, T0.Z, literal.y, T2.W, +; EG-NEXT: LSHR T6.W, T0.W, literal.z, +; EG-NEXT: AND_INT * T5.X, T0.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T17.Y, T11.W, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T18.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T17.W, T11.W, literal.z, -; EG-NEXT: AND_INT * T17.X, 
T11.W, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T6.Z, T0.W, literal.x, T2.W, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T7.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T6.Y, T0.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T8.W, T1.X, literal.z, +; EG-NEXT: AND_INT * T6.X, T0.W, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T19.Y, T12.X, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T21.Z, T12.Y, literal.y, T0.W, -; EG-NEXT: LSHR T19.W, T12.X, literal.z, -; EG-NEXT: AND_INT * T19.X, T12.X, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T8.Z, T1.X, literal.x, T2.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LSHR T9.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T8.Y, T1.X, literal.y, T2.W, +; EG-NEXT: LSHR T10.W, T1.Y, literal.z, +; EG-NEXT: AND_INT * T8.X, T1.X, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T21.Y, T12.Y, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44) -; EG-NEXT: LSHR T12.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T22.Z, T12.Z, literal.y, T0.W, -; EG-NEXT: LSHR T21.W, T12.Y, literal.z, -; EG-NEXT: AND_INT * T21.X, T12.Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T10.Z, T1.Y, literal.x, T2.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) +; EG-NEXT: LSHR T1.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T10.Y, T1.Y, literal.y, T2.W, +; EG-NEXT: LSHR T11.W, T1.Z, literal.z, +; EG-NEXT: AND_INT * T10.X, T1.Y, literal.w, +; EG-NEXT: 
2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T22.Y, T12.Z, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43) -; EG-NEXT: LSHR T23.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T24.Z, T12.W, literal.y, T0.W, -; EG-NEXT: LSHR T22.W, T12.Z, literal.z, -; EG-NEXT: AND_INT * T22.X, T12.Z, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T11.Z, T1.Z, literal.x, T2.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) +; EG-NEXT: LSHR T12.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T11.Y, T1.Z, literal.y, T2.W, +; EG-NEXT: LSHR T13.W, T1.W, literal.z, +; EG-NEXT: AND_INT * T11.X, T1.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T24.Y, T12.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT T13.Z, T1.W, literal.x, T2.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43) -; EG-NEXT: LSHR T25.X, PV.W, literal.x, -; EG-NEXT: LSHR T24.W, T12.W, literal.y, -; EG-NEXT: AND_INT * T24.X, T12.W, literal.z, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) +; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) +; EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T13.Y, T1.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: AND_INT * T13.X, T1.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: LSHR * T26.X, PV.W, literal.x, +; EG-NEXT: LSHR * T15.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: @@ -3075,122 +3061,108 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; EG-LABEL: constant_sextload_v32i8_to_v32i32: ; EG: ; %bb.0: -; EG-NEXT: 
ALU 0, @18, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @14 -; EG-NEXT: ALU 18, @19, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @16 -; EG-NEXT: ALU 75, @38, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T17.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T13.X, 1 +; EG-NEXT: ALU 8, @16, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @12 +; EG-NEXT: ALU 76, @25, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T15.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T6.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 14: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: Fetch clause starting at 16: -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; EG-NEXT: ALU clause starting at 18: -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 19: -; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_128 T4.XYZW, T3.X, 0, #1 +; EG-NEXT: VTX_READ_128 T3.XYZW, T3.X, 16, #1 +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: LSHR T1.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, 
literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T15.X, PV.W, literal.x, -; EG-NEXT: LSHR T0.Z, T12.W, literal.y, -; EG-NEXT: LSHR T0.W, T12.Z, literal.z, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: MOV * T3.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 25: +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T5.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) +; EG-NEXT: LSHR T6.X, PV.W, literal.x, +; EG-NEXT: LSHR T0.W, T3.W, literal.y, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T16.X, PS, literal.x, -; EG-NEXT: LSHR T0.Y, T12.W, literal.y, -; EG-NEXT: LSHR T1.Z, T12.Z, literal.z, -; EG-NEXT: LSHR T1.W, T12.Y, literal.w, -; EG-NEXT: LSHR * T2.W, T12.Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) -; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x, -; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T17.X, PV.W, literal.x, -; EG-NEXT: LSHR T1.Y, T12.Y, literal.y, -; EG-NEXT: LSHR T2.Z, T12.Y, literal.z, -; EG-NEXT: LSHR T3.W, T12.X, literal.y, -; EG-NEXT: LSHR * T4.W, T12.X, literal.z, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T7.X, PS, literal.x, +; EG-NEXT: LSHR T0.Y, T3.Z, literal.y, +; EG-NEXT: LSHR T0.Z, T3.Y, literal.y, +; EG-NEXT: LSHR T1.W, T3.X, literal.y, +; EG-NEXT: ASHR * T8.W, T4.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T18.X, T11.X, 0.0, literal.x, -; EG-NEXT: LSHR T2.Y, T11.W, literal.y, -; EG-NEXT: LSHR T3.Z, T11.W, literal.z, -; EG-NEXT: LSHR T5.W, T11.Z, literal.y, -; EG-NEXT: LSHR 
* T6.W, T11.X, literal.z, +; EG-NEXT: BFE_INT T8.X, T4.X, 0.0, literal.x, +; EG-NEXT: LSHR T1.Y, T4.W, literal.y, +; EG-NEXT: LSHR T1.Z, T4.Z, literal.y, +; EG-NEXT: LSHR T2.W, T4.X, literal.y, +; EG-NEXT: ASHR * T9.W, T4.Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T19.X, T11.Y, 0.0, literal.x, -; EG-NEXT: LSHR T3.Y, T11.Z, literal.y, -; EG-NEXT: LSHR T4.Z, T11.Y, literal.y, -; EG-NEXT: BFE_INT T18.W, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T6.W, T11.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T20.X, T11.Z, 0.0, literal.x, -; EG-NEXT: LSHR T4.Y, T11.Y, literal.y, -; EG-NEXT: BFE_INT T18.Z, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T19.W, PV.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T6.W, T11.X, literal.x, +; EG-NEXT: BFE_INT T9.X, T4.Y, 0.0, literal.x, +; EG-NEXT: LSHR T2.Y, T4.Y, literal.y, +; EG-NEXT: BFE_INT T8.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T4.X, literal.x, +; EG-NEXT: ASHR * T10.W, T4.Z, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T21.X, T11.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T18.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T19.Z, PV.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T20.W, T3.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T6.W, T11.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T22.X, T12.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T19.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T20.Z, T5.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.W, T3.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T5.W, T11.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T20.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BFE_INT T22.W, T4.W, 0.0, literal.x, -; EG-NEXT: LSHR * T4.W, T11.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 
0(0.000000e+00) -; EG-NEXT: BFE_INT T23.X, T12.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T11.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T3.W, T12.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T24.X, T12.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T22.Y, PS, 0.0, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T10.X, T4.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T8.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T9.Z, PV.Y, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T4.Y, literal.x, +; EG-NEXT: ASHR * T11.W, T4.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T11.X, T4.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T9.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T10.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T4.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR * T12.W, T3.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T12.X, T3.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T10.Y, PV.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T11.Z, T1.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T23.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T2.W, T4.W, literal.x, +; EG-NEXT: ASHR * T4.W, T3.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T4.X, T3.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T11.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T12.Z, T1.W, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T3.X, literal.x, +; EG-NEXT: ASHR * T13.W, T3.Z, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T13.X, T3.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T12.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T4.Z, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T1.W, T3.Y, literal.x, +; EG-NEXT: ASHR * T14.W, T3.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T14.X, T3.W, 0.0, literal.x, +; 
EG-NEXT: BFE_INT T4.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T13.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43) -; EG-NEXT: LSHR T12.X, PS, literal.x, -; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y, -; EG-NEXT: BFE_INT T23.Z, T1.Z, 0.0, literal.y, -; EG-NEXT: BFE_INT T24.W, T0.Y, 0.0, literal.y, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T25.X, PS, literal.x, -; EG-NEXT: BFE_INT T23.Y, T0.W, 0.0, literal.y, -; EG-NEXT: BFE_INT T24.Z, T0.Z, 0.0, literal.y, -; EG-NEXT: LSHR T0.W, T12.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43) +; EG-NEXT: LSHR T3.X, PS, literal.x, +; EG-NEXT: BFE_INT T13.Y, PV.W, 0.0, literal.y, +; EG-NEXT: BFE_INT T14.Z, T0.W, 0.0, literal.y, +; EG-NEXT: LSHR T0.W, T3.W, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T26.X, PS, literal.x, -; EG-NEXT: BFE_INT * T24.Y, PV.W, 0.0, literal.y, +; EG-NEXT: LSHR T15.X, PS, literal.x, +; EG-NEXT: BFE_INT * T14.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: @@ -3872,184 +3844,182 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @22 -; EG-NEXT: ALU 59, @31, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 57, @31, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @26 -; EG-NEXT: ALU 88, @91, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T32.X, 0 -; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1 +; EG-NEXT: ALU 88, @89, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T31.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T15.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T18.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T10.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T6.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T4.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1 -; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1 +; EG-NEXT: VTX_READ_128 T3.XYZW, T2.X, 16, #1 +; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 0, #1 ; EG-NEXT: Fetch clause starting at 26: -; EG-NEXT: VTX_READ_128 T32.XYZW, 
T21.X, 48, #1 -; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1 +; EG-NEXT: VTX_READ_128 T14.XYZW, T2.X, 48, #1 +; EG-NEXT: VTX_READ_128 T15.XYZW, T2.X, 32, #1 ; EG-NEXT: ALU clause starting at 30: -; EG-NEXT: MOV * T21.X, KC0[2].Z, +; EG-NEXT: MOV * T2.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 31: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T19.Z, T23.X, literal.x, PV.W, +; EG-NEXT: MOV T2.W, literal.x, +; EG-NEXT: LSHR * T0.W, T4.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_UINT * T0.Z, T4.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Y, T23.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T20.Z, T23.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T19.W, T23.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T19.X, T23.X, literal.x, -; EG-NEXT: BFE_UINT T20.Y, T23.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T0.Y, T4.X, literal.x, T2.W, +; EG-NEXT: LSHR * T1.W, T4.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T0.X, T4.X, literal.x, +; EG-NEXT: BFE_UINT T1.Z, T4.Y, literal.y, T2.W, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T24.Z, T23.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T20.W, T23.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T20.X, T23.Y, literal.x, -; EG-NEXT: BFE_UINT T24.Y, T23.Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T25.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T26.Z, T23.W, literal.y, T0.W, -; EG-NEXT: LSHR T24.W, T23.Z, literal.z, -; EG-NEXT: AND_INT * 
T24.X, T23.Z, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T1.Y, T4.Y, literal.x, T2.W, +; EG-NEXT: LSHR * T5.W, T4.Z, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T1.X, T4.Y, literal.x, +; EG-NEXT: BFE_UINT T5.Z, T4.Z, literal.y, T2.W, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) +; EG-NEXT: LSHR T6.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T5.Y, T4.Z, literal.y, T2.W, +; EG-NEXT: LSHR T7.W, T4.W, literal.z, +; EG-NEXT: AND_INT * T5.X, T4.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T26.Y, T23.W, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T27.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T26.W, T23.W, literal.z, -; EG-NEXT: AND_INT * T26.X, T23.W, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T7.Z, T4.W, literal.x, T2.W, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T8.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T7.Y, T4.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T9.W, T3.X, literal.z, +; EG-NEXT: AND_INT * T7.X, T4.W, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T28.Y, T22.X, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T30.Z, T22.Y, literal.y, T0.W, -; EG-NEXT: LSHR T28.W, T22.X, literal.z, -; EG-NEXT: AND_INT * T28.X, T22.X, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T9.Z, T3.X, literal.x, T2.W, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
48(6.726233e-44) +; EG-NEXT: LSHR T10.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T9.Y, T3.X, literal.y, T2.W, +; EG-NEXT: LSHR T11.W, T3.Y, literal.z, +; EG-NEXT: AND_INT * T9.X, T3.X, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T30.Y, T22.Y, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44) -; EG-NEXT: LSHR T22.X, PV.W, literal.x, -; EG-NEXT: LSHR T30.W, T22.Y, literal.y, -; EG-NEXT: AND_INT * T30.X, T22.Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T21.Z, T22.Z, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) -; EG-NEXT: LSHR T31.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T21.Y, T22.Z, literal.y, T0.W, +; EG-NEXT: BFE_UINT T11.Z, T3.Y, literal.x, T2.W, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T11.Y, T3.Y, literal.y, T2.W, +; EG-NEXT: AND_INT * T11.X, T3.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: ALU clause starting at 91: -; EG-NEXT: BFE_UINT T34.Z, T22.W, literal.x, T0.W, -; EG-NEXT: LSHR * T21.W, T22.Z, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T21.X, T22.Z, literal.x, -; EG-NEXT: BFE_UINT T34.Y, T22.W, literal.y, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T35.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T34.W, T22.W, literal.z, -; EG-NEXT: AND_INT * T34.X, T22.W, literal.w, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: ADD_INT T4.W, KC0[2].Y, literal.x, +; EG-NEXT: LSHR * T12.W, T3.Z, literal.y, +; EG-NEXT: 
80(1.121039e-43), 24(3.363116e-44) +; EG-NEXT: LSHR T13.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT * T12.Z, T3.Z, literal.y, T2.W, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: ALU clause starting at 89: +; EG-NEXT: BFE_UINT T12.Y, T3.Z, literal.x, T2.W, +; EG-NEXT: LSHR * T16.W, T3.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: AND_INT T12.X, T3.Z, literal.x, +; EG-NEXT: BFE_UINT T16.Z, T3.W, literal.y, T2.W, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T16.Y, T3.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T17.W, T15.X, literal.z, +; EG-NEXT: AND_INT * T16.X, T3.W, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T36.Y, T33.X, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 112(1.569454e-43) -; EG-NEXT: LSHR T37.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T38.Z, T33.Y, literal.y, T0.W, -; EG-NEXT: LSHR T36.W, T33.X, literal.z, -; EG-NEXT: AND_INT * T36.X, T33.X, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T17.Z, T15.X, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LSHR T18.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T17.Y, T15.X, literal.y, T2.W, +; EG-NEXT: LSHR T19.W, T15.Y, literal.z, +; EG-NEXT: AND_INT * T17.X, T15.X, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T38.Y, T33.Y, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 128(1.793662e-43) -; EG-NEXT: LSHR T33.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T39.Z, T33.Z, literal.y, T0.W, -; EG-NEXT: LSHR T38.W, T33.Y, literal.z, -; EG-NEXT: AND_INT * T38.X, T33.Y, 
literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T19.Z, T15.Y, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43) +; EG-NEXT: LSHR T15.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T19.Y, T15.Y, literal.y, T2.W, +; EG-NEXT: LSHR T20.W, T15.Z, literal.z, +; EG-NEXT: AND_INT * T19.X, T15.Y, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T39.Y, T33.Z, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 144(2.017870e-43) -; EG-NEXT: LSHR T40.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T41.Z, T33.W, literal.y, T0.W, -; EG-NEXT: LSHR T39.W, T33.Z, literal.z, -; EG-NEXT: AND_INT * T39.X, T33.Z, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T20.Z, T15.Z, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43) +; EG-NEXT: LSHR T21.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T20.Y, T15.Z, literal.y, T2.W, +; EG-NEXT: LSHR T22.W, T15.W, literal.z, +; EG-NEXT: AND_INT * T20.X, T15.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T41.Y, T33.W, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 160(2.242078e-43) -; EG-NEXT: LSHR T42.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T43.Z, T32.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T41.W, T33.W, literal.z, -; EG-NEXT: AND_INT * T41.X, T33.W, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T22.Z, T15.W, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 160(2.242078e-43) +; EG-NEXT: LSHR T23.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T22.Y, T15.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T24.W, T14.X, literal.z, +; EG-NEXT: AND_INT * 
T22.X, T15.W, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T43.Y, T32.X, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 176(2.466285e-43) -; EG-NEXT: LSHR T44.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T45.Z, T32.Y, literal.y, T0.W, -; EG-NEXT: LSHR T43.W, T32.X, literal.z, -; EG-NEXT: AND_INT * T43.X, T32.X, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T24.Z, T14.X, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43) +; EG-NEXT: LSHR T25.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T24.Y, T14.X, literal.y, T2.W, +; EG-NEXT: LSHR T26.W, T14.Y, literal.z, +; EG-NEXT: AND_INT * T24.X, T14.X, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T45.Y, T32.Y, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 192(2.690493e-43) -; EG-NEXT: LSHR T32.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T46.Z, T32.Z, literal.y, T0.W, -; EG-NEXT: LSHR T45.W, T32.Y, literal.z, -; EG-NEXT: AND_INT * T45.X, T32.Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T26.Z, T14.Y, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43) +; EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T26.Y, T14.Y, literal.y, T2.W, +; EG-NEXT: LSHR T27.W, T14.Z, literal.z, +; EG-NEXT: AND_INT * T26.X, T14.Y, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T46.Y, T32.Z, literal.x, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43) -; EG-NEXT: LSHR T47.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T48.Z, T32.W, literal.y, T0.W, -; EG-NEXT: LSHR T46.W, T32.Z, literal.z, 
-; EG-NEXT: AND_INT * T46.X, T32.Z, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: BFE_UINT T27.Z, T14.Z, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43) +; EG-NEXT: LSHR T28.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T27.Y, T14.Z, literal.y, T2.W, +; EG-NEXT: LSHR T29.W, T14.W, literal.z, +; EG-NEXT: AND_INT * T27.X, T14.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T48.Y, T32.W, literal.x, T0.W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43) -; EG-NEXT: LSHR T49.X, PV.W, literal.x, -; EG-NEXT: LSHR T48.W, T32.W, literal.y, -; EG-NEXT: AND_INT * T48.X, T32.W, literal.z, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) +; EG-NEXT: BFE_UINT T29.Z, T14.W, literal.x, T2.W, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43) +; EG-NEXT: LSHR T30.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T29.Y, T14.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: AND_INT * T29.X, T14.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, ; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) -; EG-NEXT: LSHR * T50.X, PV.W, literal.x, +; EG-NEXT: LSHR * T31.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: @@ -4783,231 +4753,204 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; EG-LABEL: constant_sextload_v64i8_to_v64i32: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @32, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @24 -; EG-NEXT: ALU 40, @33, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @28 -; EG-NEXT: ALU 76, @74, KC0[CB0:0-32], KC1[] -; EG-NEXT: ALU 72, @151, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T48.XYZW, T50.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T35.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T30.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T27.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T26.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T23.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T22.X, 1 +; EG-NEXT: ALU 17, @30, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 3 @22 +; EG-NEXT: ALU 78, @48, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 71, @127, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T6.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T17.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T16.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T15.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T13.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T11.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T10.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW 
T18.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 24: -; EG-NEXT: VTX_READ_128 T20.XYZW, T21.X, 32, #1 -; EG-NEXT: VTX_READ_128 T19.XYZW, T21.X, 48, #1 -; EG-NEXT: Fetch clause starting at 28: -; EG-NEXT: VTX_READ_128 T31.XYZW, T21.X, 0, #1 -; EG-NEXT: VTX_READ_128 T21.XYZW, T21.X, 16, #1 -; EG-NEXT: ALU clause starting at 32: -; EG-NEXT: MOV * T21.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 33: -; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, +; EG-NEXT: Fetch clause starting at 22: +; EG-NEXT: VTX_READ_128 T7.XYZW, T6.X, 0, #1 +; EG-NEXT: VTX_READ_128 T8.XYZW, T6.X, 16, #1 +; EG-NEXT: VTX_READ_128 T9.XYZW, T6.X, 32, #1 +; EG-NEXT: VTX_READ_128 T6.XYZW, T6.X, 48, #1 +; EG-NEXT: ALU clause starting at 30: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T23.X, PV.W, literal.x, +; EG-NEXT: LSHR T1.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T24.X, PV.W, literal.x, +; EG-NEXT: LSHR T2.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) -; EG-NEXT: LSHR T25.X, PV.W, literal.x, +; EG-NEXT: LSHR T3.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) -; EG-NEXT: LSHR T26.X, PV.W, literal.x, +; EG-NEXT: LSHR T4.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) -; EG-NEXT: LSHR T27.X, PV.W, literal.x, +; EG-NEXT: LSHR T5.X, PV.W, literal.x, +; EG-NEXT: MOV * T6.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 48: +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T10.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 2(2.802597e-45), 
96(1.345247e-43) -; EG-NEXT: LSHR T28.X, PV.W, literal.x, -; EG-NEXT: LSHR T0.Y, T19.W, literal.y, -; EG-NEXT: LSHR T0.Z, T19.Z, literal.z, -; EG-NEXT: LSHR * T0.W, T19.W, literal.w, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, -; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: LSHR T1.Y, T19.Z, literal.y, -; EG-NEXT: LSHR T1.Z, T19.Y, literal.z, -; EG-NEXT: LSHR * T1.W, T19.Z, literal.w, +; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) +; EG-NEXT: LSHR T11.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) +; EG-NEXT: LSHR T12.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43) +; EG-NEXT: LSHR T13.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43) +; EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: LSHR T0.Y, T6.W, literal.y, +; EG-NEXT: LSHR T0.Z, T6.Z, literal.y, +; EG-NEXT: LSHR T0.W, T6.Y, literal.y, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, -; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T30.X, PV.W, literal.x, -; EG-NEXT: LSHR T2.Y, T19.Y, literal.y, -; EG-NEXT: LSHR T2.Z, T19.Y, literal.z, -; EG-NEXT: LSHR T2.W, T19.X, literal.y, -; EG-NEXT: LSHR * T3.W, T19.X, literal.z, +; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T15.X, PS, literal.x, +; EG-NEXT: LSHR T1.Y, T6.X, literal.y, +; EG-NEXT: LSHR T1.Z, T9.W, literal.y, +; EG-NEXT: LSHR T1.W, T9.Z, literal.y, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 74: -; EG-NEXT: 
LSHR T3.Y, T20.W, literal.x, -; EG-NEXT: LSHR T3.Z, T20.W, literal.y, -; EG-NEXT: LSHR T4.W, T20.Z, literal.x, -; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.z, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T32.X, PS, literal.x, -; EG-NEXT: LSHR T4.Y, T20.Z, literal.y, -; EG-NEXT: LSHR T4.Z, T20.Y, literal.z, -; EG-NEXT: LSHR T5.W, T20.Y, literal.y, -; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 160(2.242078e-43) -; EG-NEXT: LSHR T33.X, PS, literal.x, -; EG-NEXT: LSHR T5.Y, T20.X, literal.y, -; EG-NEXT: LSHR T5.Z, T20.X, literal.z, -; EG-NEXT: LSHR T6.W, T21.W, literal.y, -; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.w, +; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T16.X, PS, literal.x, +; EG-NEXT: LSHR T2.Y, T9.Y, literal.y, +; EG-NEXT: LSHR T2.Z, T9.X, literal.y, +; EG-NEXT: LSHR T2.W, T8.W, literal.y, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 176(2.466285e-43) -; EG-NEXT: LSHR T34.X, PS, literal.x, -; EG-NEXT: LSHR T6.Y, T21.W, literal.y, -; EG-NEXT: LSHR T6.Z, T21.Z, literal.z, -; EG-NEXT: LSHR T7.W, T21.Z, literal.y, -; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43) -; EG-NEXT: LSHR T35.X, PS, literal.x, -; EG-NEXT: LSHR T7.Y, T21.Y, literal.y, -; EG-NEXT: LSHR T7.Z, T21.Y, literal.z, -; EG-NEXT: LSHR T8.W, T21.X, literal.y, -; EG-NEXT: LSHR * T9.W, T21.X, literal.z, +; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T17.X, PS, literal.x, +; EG-NEXT: LSHR T3.Y, T8.Z, literal.y, +; EG-NEXT: LSHR T3.Z, T8.Y, literal.y, +; EG-NEXT: LSHR T3.W, T8.X, literal.y, +; EG-NEXT: ASHR * T18.W, T7.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T36.X, 
T31.X, 0.0, literal.x, -; EG-NEXT: LSHR T8.Y, T31.W, literal.y, -; EG-NEXT: LSHR T8.Z, T31.W, literal.z, -; EG-NEXT: LSHR T10.W, T31.Z, literal.y, -; EG-NEXT: LSHR * T11.W, T31.X, literal.z, +; EG-NEXT: BFE_INT T18.X, T7.X, 0.0, literal.x, +; EG-NEXT: LSHR T4.Y, T7.W, literal.y, +; EG-NEXT: LSHR T4.Z, T7.Z, literal.y, +; EG-NEXT: LSHR T4.W, T7.X, literal.y, +; EG-NEXT: ASHR * T19.W, T7.Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T37.X, T31.Y, 0.0, literal.x, -; EG-NEXT: LSHR T9.Y, T31.Z, literal.y, -; EG-NEXT: LSHR T9.Z, T31.Y, literal.y, -; EG-NEXT: BFE_INT T36.W, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T11.W, T31.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T38.X, T31.Z, 0.0, literal.x, -; EG-NEXT: LSHR T10.Y, T31.Y, literal.y, -; EG-NEXT: BFE_INT T36.Z, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T37.W, PV.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T11.W, T31.X, literal.x, +; EG-NEXT: BFE_INT T19.X, T7.Y, 0.0, literal.x, +; EG-NEXT: LSHR T5.Y, T7.Y, literal.y, +; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR T4.W, T7.X, literal.x, +; EG-NEXT: ASHR * T20.W, T7.Z, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T39.X, T31.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T36.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T37.Z, PV.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T38.W, T9.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T11.W, T31.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T40.X, T21.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T37.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T38.Z, T10.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T39.W, T8.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T10.W, T31.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T31.X, T21.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T38.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T39.Z, 
T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BFE_INT T40.W, T9.W, 0.0, literal.x, -; EG-NEXT: LSHR * T9.W, T31.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T41.X, T21.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T39.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T40.Z, T8.W, 0.0, literal.x, -; EG-NEXT: BFE_INT * T31.W, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 151: -; EG-NEXT: LSHR * T8.W, T21.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T42.X, T21.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T40.Y, PV.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T31.Z, T7.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T41.W, T7.W, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T7.W, T21.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T43.X, T20.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T41.Z, T6.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T42.W, T6.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T7.W, T21.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T41.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T42.Z, T6.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T43.W, T5.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T6.W, T21.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T44.X, T20.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T43.Z, T5.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.W, T5.W, 0.0, literal.x, -; EG-NEXT: LSHR * T5.W, T20.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T45.X, T20.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.Z, T4.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T5.W, T20.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 
0(0.000000e+00) -; EG-NEXT: BFE_INT T46.X, T19.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T44.Z, T4.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T45.W, T3.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T4.W, T20.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T45.Z, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BFE_INT T46.W, T3.W, 0.0, literal.x, -; EG-NEXT: LSHR * T3.W, T20.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T47.X, T19.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T46.Z, T2.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T20.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T2.W, T19.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T20.X, T7.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T18.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T19.Z, PV.Y, 0.0, literal.x, +; EG-NEXT: LSHR T4.W, T7.Y, literal.x, +; EG-NEXT: ASHR * T21.W, T7.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T21.X, T7.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T19.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T20.Z, T4.Z, 0.0, literal.x, +; EG-NEXT: LSHR T4.W, T7.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR * T22.W, T8.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T22.X, T8.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T20.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T21.Z, T4.Y, 0.0, literal.x, +; EG-NEXT: LSHR T4.W, T7.W, literal.x, +; EG-NEXT: ASHR * T7.W, T8.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T7.X, T8.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T21.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.x, +; EG-NEXT: LSHR * T3.W, T8.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: 
BFE_INT T48.X, T19.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T20.Z, T2.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T47.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ALU clause starting at 127: +; EG-NEXT: ASHR * T23.W, T8.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T23.X, T8.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T22.Y, T3.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.Z, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T3.W, T8.Y, literal.x, +; EG-NEXT: ASHR * T24.W, T8.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T24.X, T8.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T23.Z, T3.Y, 0.0, literal.x, +; EG-NEXT: LSHR T3.W, T8.Z, literal.x, +; EG-NEXT: ASHR * T25.W, T9.X, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T25.X, T9.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T23.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T24.Z, T2.W, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T8.W, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR * T8.W, T9.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T8.X, T9.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T24.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T25.Z, T2.Z, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T9.X, literal.x, +; EG-NEXT: ASHR * T26.W, T9.Z, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T26.X, T9.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T25.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T8.Z, T2.Y, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T9.Y, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR * T27.W, T9.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T27.X, T9.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T8.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T26.Z, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T1.W, T9.Z, literal.x, +; EG-NEXT: ASHR * T28.W, T6.X, 
literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T28.X, T6.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T26.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T27.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T9.W, literal.x, +; EG-NEXT: ASHR * T9.W, T6.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T9.X, T6.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T27.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T28.Z, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T1.W, T6.X, literal.x, +; EG-NEXT: ASHR * T29.W, T6.Z, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T29.X, T6.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T28.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T9.Z, T0.W, 0.0, literal.x, +; EG-NEXT: LSHR T0.W, T6.Y, literal.x, +; EG-NEXT: ASHR * T30.W, T6.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T30.X, T6.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T9.Y, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T29.Z, T0.Z, 0.0, literal.x, +; EG-NEXT: LSHR T0.W, T6.Z, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43) -; EG-NEXT: LSHR T19.X, PS, literal.x, -; EG-NEXT: BFE_INT T20.Y, T1.Z, 0.0, literal.y, -; EG-NEXT: BFE_INT T47.Z, T1.Y, 0.0, literal.y, -; EG-NEXT: BFE_INT T48.W, T0.W, 0.0, literal.y, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T49.X, PS, literal.x, -; EG-NEXT: BFE_INT T47.Y, T0.Z, 0.0, literal.y, -; EG-NEXT: BFE_INT T48.Z, T0.Y, 0.0, literal.y, -; EG-NEXT: LSHR T0.W, T19.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43) +; EG-NEXT: LSHR T6.X, PS, literal.x, +; EG-NEXT: BFE_INT T29.Y, PV.W, 0.0, literal.y, +; EG-NEXT: BFE_INT T30.Z, T0.Y, 0.0, literal.y, +; EG-NEXT: LSHR T0.W, T6.W, literal.y, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 
2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T50.X, PS, literal.x, -; EG-NEXT: BFE_INT * T48.Y, PV.W, 0.0, literal.y, +; EG-NEXT: LSHR T31.X, PS, literal.x, +; EG-NEXT: BFE_INT * T30.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32: @@ -5771,32 +5714,31 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 1 +; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T5.X, T4.X, literal.x, PV.W, -; EG-NEXT: LSHR * T5.Z, T4.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: MOV T4.Y, 0.0, -; EG-NEXT: MOV T5.W, 0.0, -; EG-NEXT: MOV * T4.W, 0.0, +; EG-NEXT: LSHR T1.Z, T0.X, literal.x, +; EG-NEXT: MOV * T0.W, literal.y, +; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T1.X, T0.X, literal.x, PV.W, +; EG-NEXT: MOV T1.Y, 0.0, +; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, PV.W, +; EG-NEXT: AND_INT * T0.X, T0.X, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; 
EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV T1.W, 0.0, +; EG-NEXT: MOV * T0.W, 0.0, +; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR * T7.X, PV.W, literal.x, +; EG-NEXT: LSHR * T3.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64: @@ -5920,32 +5862,32 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; EG-NEXT: ASHR T4.W, T4.X, literal.y, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.z, +; EG-NEXT: BFE_INT T1.X, T0.X, 0.0, literal.x, +; EG-NEXT: ASHR T0.W, T0.X, literal.y, +; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ASHR T5.Y, PV.X, literal.x, -; EG-NEXT: ASHR T4.Z, T4.X, literal.y, -; EG-NEXT: LSHR T0.W, T4.X, literal.z, -; EG-NEXT: LSHR * T1.W, T4.X, literal.w, +; EG-NEXT: ASHR T1.Y, PV.X, literal.x, +; EG-NEXT: ASHR T0.Z, T0.X, literal.y, +; EG-NEXT: LSHR T1.W, T0.X, literal.z, +; EG-NEXT: LSHR * T2.W, T0.X, literal.w, ; EG-NEXT: 31(4.344025e-44), 24(3.363116e-44) ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: 
BFE_INT T4.X, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T5.Z, PV.W, 0.0, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x, +; EG-NEXT: BFE_INT T1.Z, PV.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T7.X, PV.W, literal.x, -; EG-NEXT: ASHR T4.Y, PV.X, literal.y, -; EG-NEXT: ASHR * T5.W, PV.Z, literal.y, +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: ASHR T0.Y, PV.X, literal.y, +; EG-NEXT: ASHR * T1.W, PV.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: @@ -6104,51 +6046,50 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 34, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T9.X, 1 +; EG-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T4.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T6.X, T5.Y, literal.x, PV.W, -; EG-NEXT: LSHR * T6.Z, T5.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T6.Y, 0.0, -; EG-NEXT: BFE_UINT * T7.Z, T5.Y, literal.x, T0.W, -; EG-NEXT: 
8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T7.X, T5.Y, literal.x, -; EG-NEXT: MOV * T7.Y, 0.0, +; EG-NEXT: LSHR T1.Z, T0.Y, literal.x, +; EG-NEXT: MOV * T0.W, literal.y, +; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T1.X, T0.Y, literal.x, PV.W, +; EG-NEXT: MOV T1.Y, 0.0, +; EG-NEXT: BFE_UINT T2.Z, T0.Y, literal.y, PV.W, +; EG-NEXT: AND_INT * T2.X, T0.Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T8.X, T5.X, literal.x, T0.W, -; EG-NEXT: LSHR * T8.Z, T5.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T8.Y, 0.0, -; EG-NEXT: BFE_UINT * T5.Z, T5.X, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T5.X, literal.x, -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV T6.W, 0.0, -; EG-NEXT: MOV * T7.W, 0.0, +; EG-NEXT: MOV T2.Y, 0.0, +; EG-NEXT: LSHR * T3.Z, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T3.X, T0.X, literal.x, T0.W, +; EG-NEXT: MOV T3.Y, 0.0, +; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, T0.W, +; EG-NEXT: AND_INT * T0.X, T0.X, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T8.W, 0.0, -; EG-NEXT: MOV * T5.W, 0.0, -; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV T1.W, 0.0, +; EG-NEXT: MOV * T2.W, 0.0, +; EG-NEXT: MOV T3.W, 0.0, +; EG-NEXT: MOV * T0.W, 0.0, +; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T10.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T5.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T11.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 
+; EG-NEXT: LSHR T6.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) -; EG-NEXT: LSHR * T12.X, PV.W, literal.x, +; EG-NEXT: LSHR * T7.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: @@ -6351,55 +6292,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 ; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T6.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T2.X, T0.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T8.X, PV.W, literal.x, -; EG-NEXT: ASHR T7.Y, PV.X, literal.y, -; EG-NEXT: LSHR T0.W, T5.Y, literal.z, +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: ASHR T2.Y, PV.X, literal.y, +; EG-NEXT: LSHR T0.W, T0.Y, literal.z, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T9.X, 
PS, literal.x, -; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.y, -; EG-NEXT: ASHR * T10.W, T5.X, literal.z, +; EG-NEXT: LSHR T4.X, PS, literal.x, +; EG-NEXT: BFE_INT T2.Z, PV.W, 0.0, literal.y, +; EG-NEXT: ASHR * T5.W, T0.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T11.X, T5.X, 0.0, literal.x, -; EG-NEXT: ASHR T10.Z, T5.X, literal.y, -; EG-NEXT: LSHR T0.W, T5.X, literal.z, -; EG-NEXT: ASHR * T5.W, T5.Y, literal.w, +; EG-NEXT: BFE_INT T6.X, T0.X, 0.0, literal.x, +; EG-NEXT: ASHR T5.Z, T0.X, literal.y, +; EG-NEXT: LSHR T0.W, T0.X, literal.z, +; EG-NEXT: ASHR * T7.W, T0.Y, literal.w, ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) -; EG-NEXT: BFE_INT T10.X, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T11.Y, PV.X, literal.y, -; EG-NEXT: ASHR T5.Z, T5.Y, literal.z, -; EG-NEXT: LSHR T0.W, T5.X, literal.x, -; EG-NEXT: LSHR * T1.W, T5.Y, literal.w, +; EG-NEXT: BFE_INT T5.X, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.Y, PV.X, literal.y, +; EG-NEXT: ASHR T7.Z, T0.Y, literal.z, +; EG-NEXT: LSHR T0.W, T0.X, literal.x, +; EG-NEXT: LSHR * T1.W, T0.Y, literal.w, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T5.X, PS, 0.0, literal.x, -; EG-NEXT: ASHR T10.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T11.Z, PV.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.X, PS, 0.0, literal.x, +; EG-NEXT: ASHR T5.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T6.Z, PV.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T12.X, PV.W, literal.x, -; EG-NEXT: ASHR T5.Y, PV.X, literal.y, -; EG-NEXT: ASHR T11.W, PV.Z, literal.y, -; EG-NEXT: ASHR * T7.W, T7.Z, literal.y, +; EG-NEXT: LSHR T0.X, PV.W, literal.x, +; EG-NEXT: ASHR T7.Y, PV.X, literal.y, +; EG-NEXT: ASHR T6.W, PV.Z, literal.y, +; EG-NEXT: ASHR * T2.W, T2.Z, 
literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: @@ -6667,89 +6608,88 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @12 -; EG-NEXT: ALU 68, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T18.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T17.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T15.X, 1 +; EG-NEXT: ALU 67, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T15.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T13.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T11.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T10.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T8.X, T7.W, literal.x, PV.W, -; EG-NEXT: LSHR * T8.Z, T7.W, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T8.Y, 0.0, -; EG-NEXT: BFE_UINT * T9.Z, T7.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T9.X, T7.W, literal.x, -; 
EG-NEXT: MOV * T9.Y, 0.0, +; EG-NEXT: LSHR T1.Z, T0.W, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T1.X, T0.W, literal.x, PV.W, +; EG-NEXT: MOV T1.Y, 0.0, +; EG-NEXT: BFE_UINT T2.Z, T0.W, literal.y, PV.W, +; EG-NEXT: AND_INT * T2.X, T0.W, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T10.X, T7.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T10.Z, T7.Z, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T10.Y, 0.0, -; EG-NEXT: BFE_UINT * T11.Z, T7.Z, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T11.X, T7.Z, literal.x, -; EG-NEXT: MOV * T11.Y, 0.0, +; EG-NEXT: MOV T2.Y, 0.0, +; EG-NEXT: LSHR * T3.Z, T0.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T3.X, T0.Z, literal.x, T1.W, +; EG-NEXT: MOV T3.Y, 0.0, +; EG-NEXT: BFE_UINT T4.Z, T0.Z, literal.y, T1.W, +; EG-NEXT: AND_INT * T4.X, T0.Z, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T12.X, T7.Y, literal.x, T0.W, -; EG-NEXT: LSHR * T12.Z, T7.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T12.Y, 0.0, -; EG-NEXT: BFE_UINT * T13.Z, T7.Y, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T13.X, T7.Y, literal.x, -; EG-NEXT: MOV * T13.Y, 0.0, +; EG-NEXT: MOV T4.Y, 0.0, +; EG-NEXT: LSHR * T5.Z, T0.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T5.X, T0.Y, literal.x, T1.W, +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: BFE_UINT T6.Z, T0.Y, literal.y, T1.W, +; EG-NEXT: AND_INT * T6.X, T0.Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T14.X, T7.X, literal.x, T0.W, -; EG-NEXT: LSHR * T14.Z, T7.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; 
EG-NEXT: MOV T14.Y, 0.0, -; EG-NEXT: BFE_UINT * T7.Z, T7.X, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T7.X, T7.X, literal.x, +; EG-NEXT: MOV T6.Y, 0.0, +; EG-NEXT: LSHR * T7.Z, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T7.X, T0.X, literal.x, T1.W, ; EG-NEXT: MOV T7.Y, 0.0, -; EG-NEXT: MOV T8.W, 0.0, -; EG-NEXT: MOV * T9.W, 0.0, +; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, T1.W, +; EG-NEXT: AND_INT * T0.X, T0.X, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T10.W, 0.0, -; EG-NEXT: MOV * T11.W, 0.0, -; EG-NEXT: MOV T12.W, 0.0, -; EG-NEXT: MOV * T13.W, 0.0, -; EG-NEXT: MOV T14.W, 0.0, -; EG-NEXT: MOV * T7.W, 0.0, -; EG-NEXT: LSHR T15.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV T1.W, 0.0, +; EG-NEXT: MOV * T2.W, 0.0, +; EG-NEXT: MOV T3.W, 0.0, +; EG-NEXT: MOV * T4.W, 0.0, +; EG-NEXT: MOV T5.W, 0.0, +; EG-NEXT: MOV * T6.W, 0.0, +; EG-NEXT: MOV T7.W, 0.0, +; EG-NEXT: MOV * T0.W, 0.0, +; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T16.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T9.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T17.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T10.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) -; EG-NEXT: LSHR T18.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T11.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) -; EG-NEXT: LSHR T19.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, 
KC0[2].Y, literal.y, +; EG-NEXT: LSHR T12.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T13.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) -; EG-NEXT: LSHR T21.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) -; EG-NEXT: LSHR * T22.X, PV.W, literal.x, +; EG-NEXT: LSHR * T15.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: @@ -7114,98 +7054,98 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @12 ; EG-NEXT: ALU 78, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T11.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T10.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T9.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T8.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T15.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T9.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 12: -; 
EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T9.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T10.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) -; EG-NEXT: LSHR T11.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T4.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) -; EG-NEXT: LSHR * T12.X, PV.W, literal.x, +; EG-NEXT: LSHR * T5.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_INT * T13.X, T7.W, 0.0, literal.x, +; EG-NEXT: BFE_INT * T6.X, T0.W, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T14.X, T7.Y, 0.0, literal.x, -; EG-NEXT: ASHR T13.Y, PV.X, literal.y, -; EG-NEXT: LSHR T0.W, T7.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, +; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.Y, PV.X, literal.y, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T15.X, PS, literal.x, -; EG-NEXT: ASHR T14.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T13.Z, PV.W, 0.0, literal.z, -; EG-NEXT: 
LSHR T0.W, T7.Y, literal.z, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, +; EG-NEXT: LSHR T8.X, PS, literal.x, +; EG-NEXT: ASHR T7.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T6.Z, PV.W, 0.0, literal.z, +; EG-NEXT: LSHR T1.W, T0.Y, literal.z, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43) -; EG-NEXT: LSHR T16.X, PS, literal.x, -; EG-NEXT: BFE_INT T14.Z, PV.W, 0.0, literal.y, -; EG-NEXT: ASHR * T17.W, T7.X, literal.z, +; EG-NEXT: LSHR T9.X, PS, literal.x, +; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.y, +; EG-NEXT: ASHR * T10.W, T0.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T18.X, T7.X, 0.0, literal.x, -; EG-NEXT: ASHR T17.Z, T7.X, literal.y, -; EG-NEXT: LSHR T0.W, T7.X, literal.z, -; EG-NEXT: ASHR * T19.W, T7.Y, literal.w, +; EG-NEXT: BFE_INT T11.X, T0.X, 0.0, literal.x, +; EG-NEXT: ASHR T10.Z, T0.X, literal.y, +; EG-NEXT: LSHR T1.W, T0.X, literal.z, +; EG-NEXT: ASHR * T12.W, T0.Y, literal.w, ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) -; EG-NEXT: BFE_INT T17.X, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T18.Y, PV.X, literal.y, -; EG-NEXT: ASHR T19.Z, T7.Y, literal.z, -; EG-NEXT: LSHR T0.W, T7.X, literal.x, -; EG-NEXT: LSHR * T1.W, T7.Y, literal.w, -; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) +; EG-NEXT: BFE_INT T10.X, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T11.Y, PV.X, literal.y, +; EG-NEXT: ASHR T12.Z, T0.Y, literal.z, +; EG-NEXT: LSHR T1.W, T0.X, literal.x, +; EG-NEXT: LSHR * T2.W, T0.Y, literal.w, +; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T19.X, PS, 0.0, literal.x, -; EG-NEXT: ASHR T17.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.x, -; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z, -; EG-NEXT: ASHR * T20.W, T7.Z, literal.y, +; EG-NEXT: BFE_INT T12.X, PS, 0.0, 
literal.x, +; EG-NEXT: ASHR T10.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T11.Z, PV.W, 0.0, literal.x, +; EG-NEXT: ADD_INT T1.W, KC0[2].Y, literal.z, +; EG-NEXT: ASHR * T13.W, T0.Z, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: BFE_INT T7.X, T7.Z, 0.0, literal.x, -; EG-NEXT: ASHR T19.Y, PV.X, literal.y, -; EG-NEXT: ASHR T20.Z, T7.Z, literal.z, -; EG-NEXT: LSHR T1.W, T7.Z, literal.w, -; EG-NEXT: ASHR * T21.W, T7.W, literal.y, +; EG-NEXT: BFE_INT T0.X, T0.Z, 0.0, literal.x, +; EG-NEXT: ASHR T12.Y, PV.X, literal.y, +; EG-NEXT: ASHR T13.Z, T0.Z, literal.z, +; EG-NEXT: LSHR T2.W, T0.Z, literal.w, +; EG-NEXT: ASHR * T14.W, T0.W, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T20.X, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T7.Y, PV.X, literal.y, -; EG-NEXT: ASHR T21.Z, T7.W, literal.z, -; EG-NEXT: LSHR T1.W, T7.Z, literal.x, -; EG-NEXT: LSHR * T2.W, T7.W, literal.w, +; EG-NEXT: BFE_INT T13.X, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T0.Y, PV.X, literal.y, +; EG-NEXT: ASHR T14.Z, T0.W, literal.z, +; EG-NEXT: LSHR T2.W, T0.Z, literal.x, +; EG-NEXT: LSHR * T0.W, T0.W, literal.w, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T21.X, PS, 0.0, literal.x, -; EG-NEXT: ASHR T20.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T18.W, T18.Z, literal.y, -; EG-NEXT: ASHR * T14.W, T14.Z, literal.y, +; EG-NEXT: BFE_INT T14.X, PS, 0.0, literal.x, +; EG-NEXT: ASHR T13.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T11.W, T11.Z, literal.y, +; EG-NEXT: ASHR * T7.W, T7.Z, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) -; EG-NEXT: LSHR T22.X, T0.W, literal.x, -; EG-NEXT: ASHR T21.Y, PV.X, literal.y, -; EG-NEXT: ASHR T7.W, PV.Z, literal.y, -; EG-NEXT: ASHR * T13.W, T13.Z, literal.y, +; EG-NEXT: LSHR T15.X, T1.W, 
literal.x, +; EG-NEXT: ASHR T14.Y, PV.X, literal.y, +; EG-NEXT: ASHR T0.W, PV.Z, literal.y, +; EG-NEXT: ASHR * T6.W, T6.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: @@ -7693,170 +7633,169 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @22 -; EG-NEXT: ALU 103, @27, KC0[CB0:0-32], KC1[] -; EG-NEXT: ALU 33, @131, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T39.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T38.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T35.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T34.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T33.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T30.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T29.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 1 +; EG-NEXT: ALU 102, @27, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 33, @130, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T31.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T30.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T29.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T28.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T27.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T26.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW 
T1.XYZW, T24.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T23.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T21.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T19.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T18.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T17.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T16.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 26: -; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 27: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T13.X, T11.W, literal.x, PV.W, -; EG-NEXT: LSHR * T13.Z, T11.W, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T13.Y, 0.0, -; EG-NEXT: BFE_UINT * T14.Z, T11.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T14.X, T11.W, literal.x, -; EG-NEXT: MOV * T14.Y, 0.0, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T15.X, T11.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T15.Z, T11.Z, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T15.Y, 0.0, -; EG-NEXT: BFE_UINT * T16.Z, T11.Z, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T16.X, T11.Z, literal.x, -; EG-NEXT: MOV * T16.Y, 0.0, +; EG-NEXT: LSHR T2.Z, T1.W, literal.x, +; EG-NEXT: MOV * T2.W, literal.y, +; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; EG-NEXT: BFE_UINT T2.X, T1.W, literal.x, PV.W, +; EG-NEXT: MOV T2.Y, 0.0, +; EG-NEXT: BFE_UINT T3.Z, T1.W, literal.y, PV.W, +; EG-NEXT: AND_INT * T3.X, T1.W, 
literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T17.X, T11.Y, literal.x, T0.W, -; EG-NEXT: LSHR * T17.Z, T11.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T17.Y, 0.0, -; EG-NEXT: BFE_UINT * T18.Z, T11.Y, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T18.X, T11.Y, literal.x, -; EG-NEXT: MOV * T18.Y, 0.0, +; EG-NEXT: MOV T3.Y, 0.0, +; EG-NEXT: LSHR * T4.Z, T1.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T4.X, T1.Z, literal.x, T2.W, +; EG-NEXT: MOV T4.Y, 0.0, +; EG-NEXT: BFE_UINT T5.Z, T1.Z, literal.y, T2.W, +; EG-NEXT: AND_INT * T5.X, T1.Z, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.X, T11.X, literal.x, T0.W, -; EG-NEXT: LSHR * T19.Z, T11.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T19.Y, 0.0, -; EG-NEXT: BFE_UINT * T11.Z, T11.X, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T11.X, T11.X, literal.x, -; EG-NEXT: MOV * T11.Y, 0.0, +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: LSHR * T6.Z, T1.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T1.Y, literal.x, T2.W, +; EG-NEXT: MOV T6.Y, 0.0, +; EG-NEXT: BFE_UINT T7.Z, T1.Y, literal.y, T2.W, +; EG-NEXT: AND_INT * T7.X, T1.Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T20.X, T12.W, literal.x, T0.W, -; EG-NEXT: LSHR * T20.Z, T12.W, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T20.Y, 0.0, -; EG-NEXT: BFE_UINT * T21.Z, T12.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T21.X, T12.W, literal.x, -; EG-NEXT: MOV * T21.Y, 0.0, +; EG-NEXT: MOV T7.Y, 0.0, +; EG-NEXT: LSHR * T8.Z, T1.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 
0(0.000000e+00) +; EG-NEXT: BFE_UINT T8.X, T1.X, literal.x, T2.W, +; EG-NEXT: MOV T8.Y, 0.0, +; EG-NEXT: BFE_UINT T1.Z, T1.X, literal.y, T2.W, +; EG-NEXT: AND_INT * T1.X, T1.X, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T22.X, T12.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T22.Z, T12.Z, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T22.Y, 0.0, -; EG-NEXT: BFE_UINT * T23.Z, T12.Z, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T23.X, T12.Z, literal.x, -; EG-NEXT: MOV * T23.Y, 0.0, +; EG-NEXT: MOV T1.Y, 0.0, +; EG-NEXT: LSHR * T9.Z, T0.W, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T0.W, literal.x, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: MOV T9.Y, 0.0, +; EG-NEXT: BFE_UINT T10.Z, T0.W, literal.y, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: AND_INT * T10.X, T0.W, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T24.X, T12.Y, literal.x, T0.W, -; EG-NEXT: LSHR * T24.Z, T12.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T24.Y, 0.0, -; EG-NEXT: BFE_UINT * T25.Z, T12.Y, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T25.X, T12.Y, literal.x, -; EG-NEXT: MOV * T25.Y, 0.0, +; EG-NEXT: MOV T10.Y, 0.0, +; EG-NEXT: LSHR * T11.Z, T0.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T11.X, T0.Z, literal.x, T2.W, +; EG-NEXT: MOV T11.Y, 0.0, +; EG-NEXT: BFE_UINT T12.Z, T0.Z, literal.y, T2.W, +; EG-NEXT: AND_INT * T12.X, T0.Z, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T26.X, T12.X, literal.x, T0.W, -; EG-NEXT: LSHR * T26.Z, T12.X, literal.y, -; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: MOV T26.Y, 0.0, -; EG-NEXT: BFE_UINT * T12.Z, T12.X, literal.x, T0.W, 
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T12.X, T12.X, literal.x, ; EG-NEXT: MOV T12.Y, 0.0, +; EG-NEXT: LSHR * T13.Z, T0.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T13.X, T0.Y, literal.x, T2.W, +; EG-NEXT: MOV T13.Y, 0.0, +; EG-NEXT: BFE_UINT T14.Z, T0.Y, literal.y, T2.W, +; EG-NEXT: AND_INT * T14.X, T0.Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T14.Y, 0.0, +; EG-NEXT: LSHR * T15.Z, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T15.X, T0.X, literal.x, T2.W, +; EG-NEXT: MOV T15.Y, 0.0, +; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, T2.W, +; EG-NEXT: AND_INT * T0.X, T0.X, literal.z, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV T2.W, 0.0, +; EG-NEXT: MOV * T3.W, 0.0, +; EG-NEXT: MOV T4.W, 0.0, +; EG-NEXT: MOV * T5.W, 0.0, +; EG-NEXT: MOV T6.W, 0.0, +; EG-NEXT: MOV * T7.W, 0.0, +; EG-NEXT: MOV T8.W, 0.0, +; EG-NEXT: MOV * T1.W, 0.0, +; EG-NEXT: MOV T9.W, 0.0, +; EG-NEXT: MOV * T10.W, 0.0, +; EG-NEXT: MOV T11.W, 0.0, +; EG-NEXT: MOV * T12.W, 0.0, ; EG-NEXT: MOV T13.W, 0.0, ; EG-NEXT: MOV * T14.W, 0.0, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) ; EG-NEXT: MOV T15.W, 0.0, -; EG-NEXT: MOV * T16.W, 0.0, -; EG-NEXT: MOV T17.W, 0.0, -; EG-NEXT: MOV * T18.W, 0.0, -; EG-NEXT: MOV T19.W, 0.0, -; EG-NEXT: MOV * T11.W, 0.0, -; EG-NEXT: MOV T20.W, 0.0, -; EG-NEXT: MOV * T21.W, 0.0, -; EG-NEXT: MOV T22.W, 0.0, -; EG-NEXT: MOV * T23.W, 0.0, -; EG-NEXT: MOV T24.W, 0.0, -; EG-NEXT: MOV * T25.W, 0.0, -; EG-NEXT: MOV T26.W, 0.0, -; EG-NEXT: MOV * T12.W, 0.0, -; EG-NEXT: LSHR T27.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: MOV * T0.W, 0.0, +; EG-NEXT: LSHR T16.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; 
EG-NEXT: LSHR T28.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T17.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T18.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) -; EG-NEXT: LSHR T30.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T19.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) -; EG-NEXT: LSHR * T31.X, PV.W, literal.x, +; EG-NEXT: LSHR * T20.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 131: -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ALU clause starting at 130: +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.x, ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T32.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T21.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) -; EG-NEXT: LSHR T33.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T22.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) -; EG-NEXT: LSHR T34.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T23.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) -; EG-NEXT: LSHR T35.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T24.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43) -; EG-NEXT: LSHR T36.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * 
T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T25.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43) -; EG-NEXT: LSHR T37.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T26.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43) -; EG-NEXT: LSHR T38.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T27.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T28.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43) -; EG-NEXT: LSHR T40.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T29.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43) -; EG-NEXT: LSHR T41.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T30.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) -; EG-NEXT: LSHR * T42.X, PV.W, literal.x, +; EG-NEXT: LSHR * T31.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: @@ -8558,187 +8497,187 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; EG-NEXT: TEX 1 @22 ; EG-NEXT: ALU 84, @27, KC0[CB0:0-32], KC1[] ; EG-NEXT: ALU 71, @112, KC0[], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T30.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T24.X, 0 -; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T23.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T21.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T20.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T18.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T17.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T15.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T13.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T13.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T11.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T10.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T9.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T7.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T6.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1 ; EG-NEXT: ALU clause starting at 26: -; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 27: -; EG-NEXT: LSHR 
T13.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T14.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) -; EG-NEXT: LSHR T15.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T4.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) -; EG-NEXT: LSHR T16.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T5.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) -; EG-NEXT: LSHR T17.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T6.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) -; EG-NEXT: LSHR T18.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T7.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) -; EG-NEXT: LSHR T19.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T8.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T9.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) -; EG-NEXT: LSHR T21.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T10.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 
144(2.017870e-43) -; EG-NEXT: LSHR T22.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T11.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43) -; EG-NEXT: LSHR T23.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T12.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43) -; EG-NEXT: LSHR T24.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: LSHR T13.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43) -; EG-NEXT: LSHR * T25.X, PV.W, literal.x, +; EG-NEXT: LSHR * T14.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_INT * T26.X, T11.W, 0.0, literal.x, +; EG-NEXT: BFE_INT * T15.X, T0.W, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T27.X, T11.Y, 0.0, literal.x, -; EG-NEXT: ASHR T26.Y, PV.X, literal.y, -; EG-NEXT: LSHR * T0.W, T11.W, literal.x, +; EG-NEXT: BFE_INT T16.X, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T15.Y, PV.X, literal.y, +; EG-NEXT: LSHR * T2.W, T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) -; EG-NEXT: BFE_INT T28.X, T11.X, 0.0, literal.x, -; EG-NEXT: ASHR T27.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T26.Z, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: BFE_INT T17.X, T0.X, 0.0, literal.x, +; EG-NEXT: ASHR T16.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T15.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T2.W, T0.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) -; EG-NEXT: BFE_INT T29.X, T12.W, 0.0, literal.x, -; EG-NEXT: ASHR T28.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T27.Z, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR T0.W, T11.X, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, +; EG-NEXT: BFE_INT T18.X, T1.W, 0.0, literal.x, +; EG-NEXT: ASHR T17.Y, 
PV.X, literal.y, +; EG-NEXT: BFE_INT T16.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR T2.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T30.X, PS, literal.x, -; EG-NEXT: ASHR T29.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T28.Z, PV.W, 0.0, literal.z, -; EG-NEXT: LSHR T0.W, T12.W, literal.z, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, +; EG-NEXT: LSHR T19.X, PS, literal.x, +; EG-NEXT: ASHR T18.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T17.Z, PV.W, 0.0, literal.z, +; EG-NEXT: LSHR T2.W, T1.W, literal.z, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43) -; EG-NEXT: LSHR T31.X, PS, literal.x, -; EG-NEXT: BFE_INT T29.Z, PV.W, 0.0, literal.y, -; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z, -; EG-NEXT: ASHR * T32.W, T12.X, literal.w, +; EG-NEXT: LSHR T20.X, PS, literal.x, +; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.y, +; EG-NEXT: ADD_INT T2.W, KC0[2].Y, literal.z, +; EG-NEXT: ASHR * T21.W, T1.X, literal.w, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 240(3.363116e-43), 31(4.344025e-44) -; EG-NEXT: BFE_INT T33.X, T12.Z, 0.0, literal.x, -; EG-NEXT: LSHR T0.Y, T11.Z, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: ASHR T32.Z, T12.X, literal.y, -; EG-NEXT: LSHR T1.W, T12.X, literal.z, -; EG-NEXT: ASHR * T34.W, T12.Y, literal.w, +; EG-NEXT: BFE_INT T22.X, T1.Z, 0.0, literal.x, +; EG-NEXT: LSHR T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR T21.Z, T1.X, literal.y, +; EG-NEXT: LSHR T3.W, T1.X, literal.z, +; EG-NEXT: ASHR * T23.W, T1.Y, literal.w, ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) -; EG-NEXT: BFE_INT T32.X, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T33.Y, PV.X, literal.y, -; EG-NEXT: ASHR T34.Z, T12.Y, literal.z, -; EG-NEXT: LSHR T1.W, T12.Z, literal.x, -; EG-NEXT: LSHR * T2.W, T12.Y, literal.w, 
+; EG-NEXT: BFE_INT T21.X, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T22.Y, PV.X, literal.y, +; EG-NEXT: ASHR T23.Z, T1.Y, literal.z, +; EG-NEXT: LSHR T3.W, T1.Z, literal.x, +; EG-NEXT: LSHR * T4.W, T1.Y, literal.w, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT * T34.X, PS, 0.0, literal.x, +; EG-NEXT: BFE_INT * T23.X, PS, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 112: -; EG-NEXT: ASHR T32.Y, T32.X, literal.x, -; EG-NEXT: BFE_INT T33.Z, T1.W, 0.0, literal.y, -; EG-NEXT: LSHR T1.W, T11.W, literal.z, BS:VEC_120/SCL_212 -; EG-NEXT: ASHR * T35.W, T12.Z, literal.x, +; EG-NEXT: ASHR T21.Y, T21.X, literal.x, +; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.y, +; EG-NEXT: LSHR T3.W, T0.W, literal.z, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR * T24.W, T1.Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T36.X, T12.X, 0.0, literal.x, -; EG-NEXT: ASHR T34.Y, T34.X, literal.y, BS:VEC_120/SCL_212 -; EG-NEXT: ASHR T35.Z, T12.Z, literal.z, -; EG-NEXT: LSHR T2.W, T12.Z, literal.w, -; EG-NEXT: ASHR * T37.W, T12.W, literal.y, +; EG-NEXT: BFE_INT T25.X, T1.X, 0.0, literal.x, +; EG-NEXT: ASHR T23.Y, T23.X, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR T24.Z, T1.Z, literal.z, +; EG-NEXT: LSHR T4.W, T1.Z, literal.w, +; EG-NEXT: ASHR * T26.W, T1.W, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T35.X, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T36.Y, PV.X, literal.y, -; EG-NEXT: ASHR T37.Z, T12.W, literal.z, -; EG-NEXT: LSHR T2.W, T12.X, literal.x, -; EG-NEXT: LSHR * T3.W, T12.W, literal.w, +; EG-NEXT: BFE_INT T24.X, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T25.Y, PV.X, literal.y, +; EG-NEXT: ASHR T26.Z, T1.W, literal.z, +; EG-NEXT: LSHR T4.W, T1.X, literal.x, +; EG-NEXT: LSHR * T1.W, T1.W, literal.w, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) 
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T37.X, PS, 0.0, literal.x, -; EG-NEXT: ASHR T35.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T36.Z, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR T2.W, T11.Z, literal.z, -; EG-NEXT: ASHR * T12.W, T11.X, literal.y, +; EG-NEXT: BFE_INT T26.X, PS, 0.0, literal.x, +; EG-NEXT: ASHR T24.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T25.Z, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR T1.W, T0.Z, literal.z, +; EG-NEXT: ASHR * T27.W, T0.X, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T38.X, T12.Y, 0.0, literal.x, -; EG-NEXT: ASHR T37.Y, PV.X, literal.y, -; EG-NEXT: ASHR T12.Z, T11.X, literal.z, -; EG-NEXT: LSHR T3.W, T11.X, literal.w, -; EG-NEXT: ASHR * T39.W, T11.Y, literal.y, +; EG-NEXT: BFE_INT T28.X, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T26.Y, PV.X, literal.y, +; EG-NEXT: ASHR T27.Z, T0.X, literal.z, +; EG-NEXT: LSHR T4.W, T0.X, literal.w, +; EG-NEXT: ASHR * T29.W, T0.Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T12.X, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T38.Y, PV.X, literal.y, -; EG-NEXT: ASHR T39.Z, T11.Y, literal.z, -; EG-NEXT: LSHR T3.W, T12.Y, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T4.W, T11.Y, literal.w, +; EG-NEXT: BFE_INT T27.X, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T28.Y, PV.X, literal.y, +; EG-NEXT: ASHR T29.Z, T0.Y, literal.z, +; EG-NEXT: LSHR T4.W, T1.Y, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T5.W, T0.Y, literal.w, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: BFE_INT T39.X, PS, 0.0, literal.x, -; EG-NEXT: ASHR T12.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T38.Z, PV.W, 0.0, literal.x, -; EG-NEXT: ASHR T36.W, T36.Z, literal.y, -; EG-NEXT: ASHR * T40.W, T11.Z, literal.y, +; EG-NEXT: BFE_INT T29.X, PS, 0.0, literal.x, +; EG-NEXT: ASHR T27.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T28.Z, 
PV.W, 0.0, literal.x, +; EG-NEXT: ASHR T25.W, T25.Z, literal.y, +; EG-NEXT: ASHR * T30.W, T0.Z, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) -; EG-NEXT: BFE_INT T11.X, T11.Z, 0.0, literal.x, -; EG-NEXT: ASHR T39.Y, PV.X, literal.y, -; EG-NEXT: ASHR T40.Z, T11.Z, literal.z, -; EG-NEXT: ASHR T38.W, PV.Z, literal.y, -; EG-NEXT: ASHR * T41.W, T11.W, literal.y, +; EG-NEXT: BFE_INT T0.X, T0.Z, 0.0, literal.x, +; EG-NEXT: ASHR T29.Y, PV.X, literal.y, +; EG-NEXT: ASHR T30.Z, T0.Z, literal.z, +; EG-NEXT: ASHR T28.W, PV.Z, literal.y, +; EG-NEXT: ASHR * T31.W, T0.W, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T40.X, T2.W, 0.0, literal.x, -; EG-NEXT: ASHR T11.Y, PV.X, literal.y, -; EG-NEXT: ASHR T41.Z, T11.W, literal.z, BS:VEC_120/SCL_212 -; EG-NEXT: ASHR T33.W, T33.Z, literal.y, -; EG-NEXT: ASHR * T29.W, T29.Z, literal.y, +; EG-NEXT: BFE_INT T30.X, T1.W, 0.0, literal.x, +; EG-NEXT: ASHR T0.Y, PV.X, literal.y, +; EG-NEXT: ASHR T31.Z, T0.W, literal.z, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR T22.W, T22.Z, literal.y, +; EG-NEXT: ASHR * T18.W, T18.Z, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T41.X, T1.W, 0.0, literal.x, -; EG-NEXT: ASHR T40.Y, PV.X, literal.y, -; EG-NEXT: BFE_INT T11.Z, T0.Y, 0.0, literal.x, -; EG-NEXT: ASHR T28.W, T28.Z, literal.y, -; EG-NEXT: ASHR * T27.W, T27.Z, literal.y, +; EG-NEXT: BFE_INT T31.X, T3.W, 0.0, literal.x, +; EG-NEXT: ASHR T30.Y, PV.X, literal.y, +; EG-NEXT: BFE_INT T0.Z, T2.Y, 0.0, literal.x, +; EG-NEXT: ASHR T17.W, T17.Z, literal.y, +; EG-NEXT: ASHR * T16.W, T16.Z, literal.y, ; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44) -; EG-NEXT: LSHR T42.X, T0.W, literal.x, -; EG-NEXT: ASHR T41.Y, PV.X, literal.y, -; EG-NEXT: ASHR T11.W, PV.Z, literal.y, -; EG-NEXT: ASHR * T26.W, T26.Z, literal.y, +; EG-NEXT: LSHR T1.X, T2.W, literal.x, +; EG-NEXT: ASHR T31.Y, PV.X, literal.y, +; EG-NEXT: ASHR 
T0.W, PV.Z, literal.y, +; EG-NEXT: ASHR * T15.W, T15.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: @@ -9280,21 +9219,21 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1 +; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: MOV * T4.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHL * T0.W, T5.X, literal.x, +; EG-NEXT: LSHL * T0.W, T4.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, +; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, ; EG-NEXT: 16711680(2.341805e-38), 255(3.573311e-43) -; EG-NEXT: OR_INT T5.X, PS, PV.W, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT T4.X, PS, PV.W, +; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i16: @@ -9384,16 +9323,16 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 16, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1 +; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: MOV * T4.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 10: 
-; EG-NEXT: AND_INT T0.W, T5.X, literal.x, +; EG-NEXT: AND_INT T0.W, T4.X, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, @@ -9407,8 +9346,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; EG-NEXT: LSHL T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT T5.X, PS, PV.W, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT T4.X, PS, PV.W, +; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i16: @@ -9499,47 +9438,47 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: AND_INT T0.W, T7.X, literal.x, +; EG-NEXT: LSHL * T0.W, T5.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T7.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 
65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T0.W, T5.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV * T3.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, +; EG-NEXT: LSHR * T0.W, T5.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T8.X, T4.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T5.X, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.X, +; EG-NEXT: MOV * T5.Y, T3.X, ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: @@ -9641,34 +9580,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * 
T0.Y, T4.X, -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T5.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -9676,18 +9607,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T7.X, literal.x, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, 
PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: ASHR * T0.W, T5.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: LSHL * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T6.Y, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T8.X, T4.X, +; EG-NEXT: MOV T3.X, PV.Y, +; EG-NEXT: MOV * T6.X, T2.X, ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: @@ -9808,77 +9747,77 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 +; EG-NEXT: VTX_READ_64 T7.XY, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: MOV * T7.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: AND_INT T0.W, T11.X, literal.x, +; EG-NEXT: LSHL * T0.W, T7.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T11.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT 
T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T7.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T11.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T12.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T11.Y, literal.x, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHL * T1.W, T7.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: BFE_UINT * T0.W, T7.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; 
EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T12.W, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, ; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T12.X, T8.X, -; EG-NEXT: MOV * T12.Z, T4.X, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T7.X, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T7.X, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.X, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T7.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T7.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T7.Z, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T2.X, PV.Z, +; EG-NEXT: MOV T7.Y, T5.X, +; EG-NEXT: MOV * T7.W, T3.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: @@ -10025,24 +9964,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause 
starting at 6: -; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 +; EG-NEXT: VTX_READ_64 T7.XY, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: MOV * T7.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T11.X, literal.x, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHR * T0.W, T7.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10050,9 +10001,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T7.Y, 
literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10060,55 +10011,43 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T11.X, literal.x, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: ASHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: LSHL * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T12.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x, +; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: -65536(nan), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; 
EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T11.Y, literal.x, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: ASHR * T0.W, T7.Y, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: LSHL * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T12.W, PV.W, PS, +; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T8.W, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T12.X, T8.X, -; EG-NEXT: MOV * T12.Z, T4.X, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T8.X, T4.X, +; EG-NEXT: MOV * T8.Z, T2.X, ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: @@ -10303,144 +10242,145 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 103, @12, KC0[], KC1[] -; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 +; EG-NEXT: ALU 104, @12, KC0[], KC1[] +; EG-NEXT: ALU 20, @117, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; 
EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T11.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: AND_INT T0.W, T19.X, literal.x, +; EG-NEXT: LSHL * T0.W, T11.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T19.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T19.X, literal.x, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: LSHL * T1.W, T11.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T20.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T12.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, 
PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T19.Y, literal.x, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: BFE_UINT * T1.W, T11.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHL * T1.W, T11.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T1.W, T11.Z, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T19.Y, literal.x, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHL * T1.W, T11.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T20.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: BFE_UINT * T0.W, T11.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, 
BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T11.X, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T19.Z, literal.x, +; EG-NEXT: OR_INT * T12.X, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.X, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T19.Z, literal.x, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV * T0.Y, T6.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T11.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T12.Z, PV.W, PS, +; EG-NEXT: MOV T6.X, PV.Z, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T11.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: 
AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T19.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, ; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T19.W, literal.y, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T11.Z, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T19.W, literal.x, +; EG-NEXT: OR_INT * T11.X, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.X, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T11.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: ALU clause starting at 117: +; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, T0.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 116: -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR T0.W, T19.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T21.X, PS, literal.x, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) -; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00) -; EG-NEXT: LSHR 
T22.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T19.W, PV.W, PS, +; EG-NEXT: LSHR T13.X, PV.W, literal.x, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.y, +; EG-NEXT: AND_INT * T1.W, T11.W, literal.z, +; EG-NEXT: 2(2.802597e-45), -65536(nan) +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T14.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T11.Z, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T20.X, T16.X, -; EG-NEXT: MOV * T20.Z, T12.X, -; EG-NEXT: MOV T19.X, T8.X, -; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T2.X, PV.Z, +; EG-NEXT: MOV T12.Y, T9.X, +; EG-NEXT: MOV T12.W, T7.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T11.Y, T5.X, +; EG-NEXT: MOV * T11.W, T3.X, ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: @@ -10690,27 +10630,39 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 104, @12, KC0[], KC1[] -; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 +; EG-NEXT: ALU 105, @12, KC0[], KC1[] +; EG-NEXT: ALU 45, @118, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 ; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T11.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: BFE_INT * T0.W, 
PV.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.X, literal.x, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10718,9 +10670,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, -; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10728,25 +10680,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T19.X, literal.x, -; EG-NEXT: 
24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T20.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV T0.Y, T12.X, -; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHR * T0.W, T11.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10754,9 +10690,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T11.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10764,25 +10700,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T19.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T20.W, PV.W, PS, -; 
EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV T0.Y, T8.X, -; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHR * T0.W, T11.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10790,9 +10710,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T11.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -10800,61 +10720,81 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T19.Z, literal.x, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T8.X, +; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: ASHR * T0.W, T11.X, literal.x, 
BS:VEC_120/SCL_212 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: LSHL * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: ALU clause starting at 117: -; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W, +; EG-NEXT: OR_INT * T12.Y, PV.W, PS, ; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: -65536(nan), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.W, literal.x, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: ASHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 118: +; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T0.W, T0.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T12.W, PV.W, PS, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T11.Z, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, ; EG-NEXT: MOV T4.X, PV.W, ; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T19.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; 
EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR T0.W, T19.W, literal.x, +; EG-NEXT: ASHR * T0.W, T11.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T11.Y, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: BFE_INT * T0.W, T11.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: ASHR T0.W, T11.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T21.X, PS, literal.x, +; EG-NEXT: LSHR T13.X, PS, literal.x, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, ; EG-NEXT: LSHL * T0.W, PV.W, literal.z, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T19.W, PV.W, PS, +; EG-NEXT: LSHR T14.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T11.W, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T20.X, T16.X, -; EG-NEXT: MOV * T20.Z, T12.X, -; EG-NEXT: MOV T19.X, T8.X, -; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T12.X, T8.X, +; EG-NEXT: MOV * T12.Z, T6.X, +; EG-NEXT: MOV T11.X, T4.X, +; EG-NEXT: MOV * T11.Z, T2.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: @@ -11201,274 +11141,275 @@ define amdgpu_kernel void 
@constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @10 -; EG-NEXT: ALU 103, @16, KC0[], KC1[] -; EG-NEXT: ALU 104, @120, KC0[], KC1[] -; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 +; EG-NEXT: ALU 107, @16, KC0[], KC1[] +; EG-NEXT: ALU 101, @124, KC0[], KC1[] +; EG-NEXT: ALU 41, @226, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T26.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T24.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 -; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1 +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: AND_INT T0.W, T37.X, literal.x, +; EG-NEXT: LSHL * T0.W, T20.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T37.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, 
-; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T20.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T12.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: LSHL * T1.W, T20.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: BFE_UINT * T1.W, T20.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, 
PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHL * T1.W, T20.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T1.W, T20.Z, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHL * T1.W, T20.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.W, literal.x, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: BFE_UINT * T1.W, T20.W, literal.x, 
T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T16.X, +; EG-NEXT: LSHL * T1.W, T19.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: BFE_UINT * T1.W, T19.X, literal.x, T0.W, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 120: -; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.W, literal.x, +; EG-NEXT: MOV T17.X, PV.W, +; EG-NEXT: MOV T0.Y, T14.X, +; EG-NEXT: LSHL * T1.W, T19.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T32.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.X, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T32.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T35.X, literal.x, +; EG-NEXT: MOV T14.X, PV.W, +; EG-NEXT: MOV T0.Y, T15.X, +; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 
0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV T15.X, PV.W, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: LSHL * T1.W, T19.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T32.X, PV.W, -; EG-NEXT: MOV T0.Y, T33.X, -; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T33.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T35.X, literal.x, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV T0.Y, T10.X, +; EG-NEXT: LSHL * T1.W, T19.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: ALU clause starting at 124: +; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T1.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T38.Y, PV.W, PS, -; EG-NEXT: MOV T33.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T28.X, +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T10.X, PV.W, +; EG-NEXT: MOV T0.Y, T11.X, +; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV T11.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T20.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y, +; EG-NEXT: AND_INT * T0.W, 
PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T20.X, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T28.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T35.Y, literal.x, +; EG-NEXT: OR_INT * T21.X, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.X, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T28.X, PV.W, -; EG-NEXT: MOV T0.Y, T29.X, -; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T29.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T35.Y, literal.x, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV * T0.Y, T6.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T20.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T21.Z, PV.W, PS, +; EG-NEXT: MOV T6.X, PV.Z, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T38.W, PV.W, PS, -; EG-NEXT: MOV T29.X, PV.W, -; EG-NEXT: MOV * 
T0.Y, T24.X, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T20.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T20.X, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.X, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T20.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T20.W, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T24.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T35.Z, literal.x, +; EG-NEXT: OR_INT * T20.Z, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.Z, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T24.X, PV.W, -; EG-NEXT: MOV T0.Y, T25.X, -; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T25.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T35.Z, literal.x, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.W, +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, 
T19.X, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T22.X, PV.W, PS, +; EG-NEXT: MOV T16.X, PV.X, +; EG-NEXT: MOV T0.Y, T15.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T35.Y, PV.W, PS, -; EG-NEXT: MOV T25.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T20.X, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T15.X, PV.W, +; EG-NEXT: MOV * T0.Y, T14.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T19.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T22.Z, PV.W, PS, +; EG-NEXT: MOV T14.X, PV.Z, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.W, literal.y, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: ALU clause starting at 226: +; EG-NEXT: OR_INT * T0.W, T1.W, T0.W, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV * T0.Y, T12.X, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T19.Z, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T20.X, PV.W, -; EG-NEXT: ALU clause starting at 225: -; EG-NEXT: MOV T0.Y, T20.X, -; EG-NEXT: LSHL * T1.W, T35.W, literal.x, +; EG-NEXT: OR_INT * T19.X, PV.W, PS, +; EG-NEXT: MOV T12.X, PV.X, +; EG-NEXT: MOV T0.Y, T11.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: 
65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T20.X, PV.W, -; EG-NEXT: MOV T0.Y, T21.X, -; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, -; EG-NEXT: MOV * T21.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T11.X, PV.W, +; EG-NEXT: MOV T0.Y, T10.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHR T23.X, PV.W, literal.x, +; EG-NEXT: LSHR * T24.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR T0.W, T35.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T41.X, PS, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T25.X, PV.W, literal.x, ; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, -; EG-NEXT: AND_INT T0.W, PV.W, literal.z, +; EG-NEXT: AND_INT T0.W, T19.W, literal.z, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) -; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44) -; EG-NEXT: LSHR T42.X, PS, literal.x, -; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; EG-NEXT: 2(2.802597e-45), -65536(nan) +; EG-NEXT: 255(3.573311e-43), 32(4.484155e-44) +; EG-NEXT: LSHR T26.X, PS, literal.x, +; EG-NEXT: OR_INT * T19.Z, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T36.X, T16.X, -; EG-NEXT: MOV * T36.Z, T12.X, -; EG-NEXT: MOV T37.X, T8.X, -; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 -; EG-NEXT: MOV * T38.X, T32.X, -; EG-NEXT: MOV * T38.Z, T28.X, -; EG-NEXT: MOV T35.X, T24.X, -; EG-NEXT: MOV * T35.Z, 
T20.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T10.X, PV.Z, +; EG-NEXT: MOV T21.Y, T9.X, +; EG-NEXT: MOV T21.W, T7.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T20.Y, T5.X, +; EG-NEXT: MOV * T20.W, T3.X, +; EG-NEXT: MOV T22.Y, T17.X, +; EG-NEXT: MOV T22.W, T15.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T19.Y, T13.X, +; EG-NEXT: MOV * T19.W, T11.X, ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: @@ -11929,27 +11870,39 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: ALU 104, @16, KC0[], KC1[] ; EG-NEXT: ALU 104, @121, KC0[], KC1[] ; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T26.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T24.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 -; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1 +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T20.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: 
16(2.242078e-44), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.X, literal.x, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T20.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -11957,9 +11910,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, -; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -11967,25 +11920,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, -; EG-NEXT: MOV 
T17.X, PV.Y, -; EG-NEXT: MOV T0.Y, T12.X, -; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -11993,9 +11930,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12003,25 +11940,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV T0.Y, T8.X, -; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, 
literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: LSHR * T0.W, T20.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12029,9 +11950,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: LSHR * T0.W, T20.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12039,26 +11960,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.Z, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: ALU clause starting at 121: -; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, 
T37.W, literal.x, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T16.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12066,9 +11970,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T37.W, literal.x, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12076,35 +11980,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.W, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV T0.Y, T32.X, -; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T32.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.X, literal.x, +; EG-NEXT: MOV T17.X, PV.W, +; EG-NEXT: MOV T0.Y, T14.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 
0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: ALU clause starting at 121: +; EG-NEXT: LSHL * T0.W, T0.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T32.X, PV.W, -; EG-NEXT: MOV T0.Y, T33.X, -; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T14.X, PV.W, +; EG-NEXT: MOV T0.Y, T15.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12112,25 +12001,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T33.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T35.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T38.Y, PV.W, PS, -; EG-NEXT: MOV T33.X, PV.Y, -; EG-NEXT: MOV T0.Y, T28.X, -; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T28.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, +; EG-NEXT: MOV T15.X, PV.W, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12138,9 +12011,9 @@ define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T28.X, PV.W, -; EG-NEXT: MOV T0.Y, T29.X, -; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12148,26 +12021,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T29.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T35.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 226: -; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T0.W, T0.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T38.W, PV.W, PS, -; EG-NEXT: MOV T29.X, PV.W, -; EG-NEXT: MOV T0.Y, T24.X, -; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T24.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV T0.Y, T10.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12175,9 +12031,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, 
PV.W, -; EG-NEXT: MOV T24.X, PV.W, -; EG-NEXT: MOV T0.Y, T25.X, -; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, +; EG-NEXT: MOV T10.X, PV.W, +; EG-NEXT: MOV T0.Y, T11.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, @@ -12185,70 +12041,155 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T25.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T35.Z, literal.x, +; EG-NEXT: MOV T11.X, PV.W, +; EG-NEXT: MOV T0.Y, T8.X, +; EG-NEXT: BFE_INT * T0.W, T20.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: ASHR * T0.W, T20.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: LSHL * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T35.Y, PV.W, PS, -; EG-NEXT: MOV T25.X, PV.Y, -; EG-NEXT: MOV T0.Y, T20.X, -; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x, +; EG-NEXT: OR_INT * T21.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: BFE_INT * T0.W, T20.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, ; EG-NEXT: -65536(nan), 65535(9.183409e-41) ; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T20.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.W, literal.x, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: ASHR * 
T0.W, T20.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T21.W, PV.W, PS, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T20.Z, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T20.X, PV.W, -; EG-NEXT: MOV T0.Y, T21.X, -; EG-NEXT: LSHR * T0.W, T35.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T21.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: ASHR * T0.W, T20.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T20.Y, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV T0.Y, T2.X, +; EG-NEXT: BFE_INT * T0.W, T20.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 226: +; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, T0.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: 
MOV T2.X, PV.W, +; EG-NEXT: MOV T0.Y, T3.X, +; EG-NEXT: ASHR * T0.W, T20.W, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T20.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV T0.Y, T16.X, +; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: ASHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T22.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV T0.Y, T14.X, +; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T14.X, PV.W, +; EG-NEXT: MOV T0.Y, T15.X, +; EG-NEXT: ASHR * T0.W, T19.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T22.W, PV.W, PS, +; EG-NEXT: MOV T15.X, PV.W, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV 
T0.Y, T13.X, +; EG-NEXT: ASHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T19.Y, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.Y, +; EG-NEXT: MOV T0.Y, T10.X, +; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T10.X, PV.W, +; EG-NEXT: MOV T0.Y, T11.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHR T23.X, PV.W, literal.x, +; EG-NEXT: LSHR * T24.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ASHR T0.W, T35.W, literal.x, +; EG-NEXT: ASHR T0.W, T19.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T41.X, PS, literal.x, +; EG-NEXT: LSHR T25.X, PS, literal.x, ; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, ; EG-NEXT: LSHL T0.W, PV.W, literal.z, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T42.X, PS, literal.x, -; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; EG-NEXT: LSHR T26.X, PS, literal.x, +; EG-NEXT: OR_INT * T19.W, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T36.X, T16.X, -; EG-NEXT: MOV * T36.Z, T12.X, -; EG-NEXT: MOV T37.X, T8.X, -; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 -; EG-NEXT: MOV * T38.X, T32.X, -; EG-NEXT: MOV * T38.Z, T28.X, -; EG-NEXT: MOV T35.X, T24.X, -; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV T11.X, PV.W, +; 
EG-NEXT: MOV * T21.X, T8.X, +; EG-NEXT: MOV * T21.Z, T6.X, +; EG-NEXT: MOV T20.X, T4.X, +; EG-NEXT: MOV T20.Z, T2.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T22.X, T16.X, +; EG-NEXT: MOV * T22.Z, T14.X, +; EG-NEXT: MOV T19.X, T12.X, +; EG-NEXT: MOV * T19.Z, T10.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 64f1f45bf734c..c0f377eccf4fa 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1206,18 +1206,18 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T4.Y, T4.X, literal.x, +; EG-NEXT: LSHR * T0.Y, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; ; CM-LABEL: global_zextload_v2i16_to_v2i32: @@ -1225,19 +1225,19 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 ; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 
6: -; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T4.X, KC0[2].Z, +; CM-NEXT: MOV * T0.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: LSHR * T4.Y, T4.X, literal.x, +; CM-NEXT: LSHR * T0.Y, T0.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT * T4.X, T4.X, literal.x, +; CM-NEXT: AND_INT * T0.X, T0.X, literal.x, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %load = load <2 x i16>, ptr addrspace(1) %in %ext = zext <2 x i16> %load to <2 x i32> @@ -1304,41 +1304,40 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.W, T4.X, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) -; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x, +; EG-NEXT: ASHR * T0.Y, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; ; CM-LABEL: global_sextload_v2i16_to_v2i32: ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 ; CM-NEXT: ALU 5, @9, 
KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T4.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T4.X, KC0[2].Z, +; CM-NEXT: MOV * T0.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; CM-NEXT: LSHR * T0.W, T4.X, literal.x, +; CM-NEXT: ASHR * T0.Y, T0.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %load = load <2 x i16>, ptr addrspace(1) %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -5575,20 +5574,20 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T4.Z, T4.X, literal.x, +; EG-NEXT: LSHR * T0.Z, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: MOV T4.Y, 0.0, -; EG-NEXT: MOV T4.W, 0.0, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; 
EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV T0.W, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; ; CM-LABEL: global_zextload_v2i16_to_v2i64: @@ -5596,21 +5595,21 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 ; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T4.X, KC0[2].Z, +; CM-NEXT: MOV * T0.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: LSHR * T4.Z, T4.X, literal.x, +; CM-NEXT: LSHR * T0.Z, T0.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T4.X, T4.X, literal.x, -; CM-NEXT: MOV T4.Y, 0.0, -; CM-NEXT: MOV * T4.W, 0.0, +; CM-NEXT: AND_INT T0.X, T0.X, literal.x, +; CM-NEXT: MOV T0.Y, 0.0, +; CM-NEXT: MOV * T0.W, 0.0, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %load = load <2 x i16>, ptr addrspace(1) %in %ext = zext <2 x i16> %load to <2 x i64> @@ -5686,22 +5685,22 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 ; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU 
clause starting at 9: -; EG-NEXT: ASHR * T4.W, T4.X, literal.x, +; EG-NEXT: ASHR * T0.W, T0.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR * T4.Z, T4.X, literal.x, +; EG-NEXT: ASHR * T0.Z, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T4.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) -; EG-NEXT: ASHR * T4.Y, PV.X, literal.x, +; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; ; CM-LABEL: global_sextload_v2i16_to_v2i64: @@ -5709,22 +5708,22 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 ; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T4.X, KC0[2].Z, +; CM-NEXT: MOV * T0.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: ASHR * T4.W, T4.X, literal.x, +; CM-NEXT: ASHR * T0.W, T0.X, literal.x, ; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; CM-NEXT: ASHR * T4.Z, T4.X, literal.x, +; CM-NEXT: ASHR * T0.Z, T0.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT * T4.X, T4.X, 0.0, literal.x, +; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; CM-NEXT: ASHR * T4.Y, PV.X, literal.y, +; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; CM-NEXT: ASHR * T0.Y, PV.X, literal.y, ; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44) %load = load <2 x i16>, ptr addrspace(1) %in %ext = sext <2 x i16> %load to <2 x 
i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 1dd08c561b2ab..bd84e753cbc84 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s @@ -7,240 +8,653 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s -; FUNC-LABEL: {{^}}local_load_i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_u16 v{{[0-9]+}} - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b16 v1, v0 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_i16: +; GFX9: ; %bb.0: ; %entry +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 4, @0, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load i16, ptr addrspace(3) %in store i16 %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v2i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b32 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v2i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v2i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, 
OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <2 x i16>, ptr addrspace(3) %in store <2 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v3i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 -; GCN-DAG: ds_write_b32 -; GCN-DAG: ds_write_b16 - -; EG-DAG: LDS_USHORT_READ_RET -; EG-DAG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v3i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v2, v0 +; SI-NEXT: ds_write_b16 v2, v1 offset:4 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v3i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v2, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v2, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v3i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 19, @2, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.Z, OQAP, +; EG-NEXT: LSHL T0.Z, PV.Z, literal.x, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.z, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: OR_INT T0.W, T0.Z, 
T0.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.Y, +; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in store <3 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v4i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v4i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v4i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v4i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 11, @3, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <4 x i16>, ptr addrspace(3) %in store <4 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v8i16: 
-; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v8i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v8i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v8i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 
0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <8 x i16>, ptr addrspace(3) %in store <8 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v16i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v16i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v16i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v16i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 53, 
@5, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; 
EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <16 x i16>, ptr addrspace(3) %in store <16 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_i16_to_i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_u16 -; GCN: ds_write_b32 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_i16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_i16_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_i16_to_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = zext i16 %a to i32 store i32 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_i16_to_i32: 
-; GCN-NOT: s_wqm_b64 - -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_i16 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_i16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_i16_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_i16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_i16_to_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 6, @7, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = sext i16 %a to i32 store i32 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_u16 -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; 
EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v1i16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v1i16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v1i16_to_v1i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_i16 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v1i16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v1i16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_i16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v1i16_to_v1i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b32 - -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v2i16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v2i16_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v2i16_to_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: AND_INT T0.W, PV.X, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b32 - -; EG: LDS_READ_RET -; EG: BFE_INT -; EG: BFE_INT define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v2i16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v2i16_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v2i16_to_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 -; SI-DAG: ds_write_b32 -; SI-DAG: ds_write_b64 -; CIVI-DAG: ds_write_b96 -; GFX9-DAG: ds_write_b96 - -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_local_zextload_v3i16_to_v3i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: ds_write_b32 v4, v0 offset:8 +; SI-NEXT: ds_write_b64 v4, v[2:3] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_local_zextload_v3i16_to_v3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_local_zextload_v3i16_to_v3i32: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 18, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.Z, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.Y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -248,23 +662,64 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 -; SI-DAG: ds_write_b32 -; SI-DAG: ds_write_b64 -; CIVI-DAG: ds_write_b96 -; GFX9-DAG: ds_write_b96 - -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_local_sextload_v3i16_to_v3i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v1, 0, 16 +; SI-NEXT: ds_write_b32 v4, v0 offset:8 +; SI-NEXT: ds_write_b64 v4, v[2:3] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_local_sextload_v3i16_to_v3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[3:4], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v3 +; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v3, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_local_sextload_v3i16_to_v3i32: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 22, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr 
addrspace(3) %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -272,698 +727,5078 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_local_zextload_v4i16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_local_zextload_v4i16_to_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_local_zextload_v4i16_to_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 22, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; 
EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <4 x i16>, ptr addrspace(3) %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v4i16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v6, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; SI-NEXT: v_ashr_i64 v[4:5], v[0:1], 48 +; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v1, 0, 16 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: ds_write2_b64 v6, v[2:3], v[0:1] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v4i16_to_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; GFX9-NEXT: 
v_bfe_i32 v2, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v4i16_to_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 25, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T1.Z, PV.Z, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v8i16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; 
SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v12, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v8i16_to_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX9-NEXT: ds_write2_b64 v10, v[0:1], v[8:9] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v10, v[4:5], v[6:7] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v8i16_to_v8i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 46, @16, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV 
T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: AND_INT T1.W, T0.W, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <8 x i16>, ptr addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT 
-; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v8i16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v12, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v3, 0, 16 +; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v8i16_to_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v12, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GFX9-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GFX9-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GFX9-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v8i16_to_v8i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 51, @17, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: 
LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: LSHR * T1.W, T0.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: LSHR T1.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T2.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T1.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; 
EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <8 x i16>, ptr addrspace(3) %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} - -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v16i16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v24, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v6 +; SI-NEXT: 
ds_write2_b64 v24, v[22:23], v[20:21] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v24, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v24, v[10:11], v[8:9] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v16i16_to_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v16i16_to_v16i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 94, @18, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 
28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: MOV * T2.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: MOV * T3.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) 
+; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: 
{{^}}local_sextload_v16i16_to_v16i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v16i16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v24, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v3 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v6 +; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v7, 0, 16 +; SI-NEXT: v_bfe_i32 v22, v6, 0, 16 +; SI-NEXT: ds_write2_b64 v24, v[22:23], v[20:21] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v24, v[18:19], v[16:17] offset0:4 offset1:5 +; 
SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v24, v[10:11], v[8:9] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v16i16_to_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v21, 16, v7 +; GFX9-NEXT: v_ashrrev_i32_e32 v23, 16, v6 +; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v20, v7, 0, 16 +; GFX9-NEXT: v_bfe_i32 v22, v6, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 16, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 16, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v19, 16, v4 +; GFX9-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX9-NEXT: v_bfe_i32 v12, v3, 0, 16 +; GFX9-NEXT: v_bfe_i32 v14, v2, 0, 16 +; GFX9-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GFX9-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-NEXT: ds_write2_b64 v0, v[22:23], v[20:21] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v0, v[18:19], v[16:17] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v16i16_to_v16i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 95, @19, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 
20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: MOV * T2.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: LSHR * T3.Z, T2.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T2.W, T2.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T4.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T3.Z, T0.Z, literal.x, +; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T4.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T3.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T4.Z, T1.Z, literal.x, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T3.Z, T2.Z, literal.x, +; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T2.Y, 0.0, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ALU 7, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, 
+; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v32i16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2 +; SI-NEXT: 
s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v32, s0 +; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v32i16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; 
GFX9-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; GFX9-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: ds_write2_b64 v32, v[12:13], v[30:31] offset0:12 offset1:13 +; GFX9-NEXT: ds_write2_b64 v32, v[14:15], v[28:29] offset0:14 offset1:15 +; GFX9-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 +; GFX9-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 +; GFX9-NEXT: ds_write2_b64 v32, v[4:5], v[22:23] offset0:4 offset1:5 +; 
GFX9-NEXT: ds_write2_b64 v32, v[6:7], v[20:21] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v32, v[0:1], v[18:19] offset1:1 +; GFX9-NEXT: ds_write2_b64 v32, v[2:3], v[16:17] offset0:2 offset1:3 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v32i16_to_v32i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 105, @21, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; 
EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: MOV * T4.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: LSHR T5.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T4.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; 
EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, +; EG-NEXT: MOV * T5.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T3.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: ALU 84, @22, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T2.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 
88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT 
T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET 
-; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v32i16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2 +; SI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v22, v2, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4 +; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7 +; SI-NEXT: v_bfe_i32 v4, v7, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6 +; SI-NEXT: v_bfe_i32 v6, v6, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v9 +; SI-NEXT: v_bfe_i32 v24, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 +; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11 +; SI-NEXT: v_bfe_i32 v26, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 +; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13 +; SI-NEXT: v_bfe_i32 v28, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; 
SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 +; SI-NEXT: v_bfe_i32 v30, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 +; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 +; SI-NEXT: v_mov_b32_e32 v32, s0 +; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v32i16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; GFX9-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; GFX9-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 16, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v19, 16, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v21, 16, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v31, 16, v13 +; GFX9-NEXT: v_bfe_i32 v30, v13, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; GFX9-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v23, 16, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; GFX9-NEXT: v_ashrrev_i32_e32 v27, 16, v6 +; GFX9-NEXT: v_bfe_i32 v16, v3, 0, 16 +; GFX9-NEXT: v_bfe_i32 v18, v2, 0, 16 +; GFX9-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GFX9-NEXT: v_bfe_i32 v22, v0, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v4 
+; GFX9-NEXT: v_bfe_i32 v24, v7, 0, 16 +; GFX9-NEXT: v_bfe_i32 v26, v6, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v5, 0, 16 +; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v9 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 16, v8 +; GFX9-NEXT: v_bfe_i32 v4, v9, 0, 16 +; GFX9-NEXT: v_bfe_i32 v6, v8, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v11 +; GFX9-NEXT: v_bfe_i32 v8, v11, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v10 +; GFX9-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v29, 16, v15 +; GFX9-NEXT: v_bfe_i32 v28, v15, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 16, v14 +; GFX9-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX9-NEXT: ds_write2_b64 v32, v[12:13], v[30:31] offset0:12 offset1:13 +; GFX9-NEXT: ds_write2_b64 v32, v[14:15], v[28:29] offset0:14 offset1:15 +; GFX9-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:10 offset1:11 +; GFX9-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:8 offset1:9 +; GFX9-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v32, v[26:27], v[24:25] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset1:1 +; GFX9-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:2 offset1:3 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v32i16_to_v32i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 101, @23, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 
20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: LSHR * T5.W, T4.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: LSHR T5.Z, T4.W, literal.x, 
+; EG-NEXT: BFE_INT T5.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T0.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T1.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T1.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR * T6.Z, T2.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 89, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T2.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * 
T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T2.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T3.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T3.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T3.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T4.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T5.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: BFE_INT T5.W, T4.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: BFE_INT T4.W, T4.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; 
EG-NEXT: BFE_INT T4.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: BFE_INT T4.W, T0.Z, 0.0, literal.x, +; EG-NEXT: MOV * T5.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T4.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T4.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T4.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) +; EG-NEXT: ALU 16, @25, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * 
T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 
offset1:23 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v64i16_to_v64i32: +; SI: ; %bb.0: +; 
SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0xe8f000 +; SI-NEXT: s_add_u32 s12, s12, s11 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9 +; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:10 offset1:11 +; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13 +; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1 +; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[36:39], v24 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[40:43], v24 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v44, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v46, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) lgkmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; SI-NEXT: 
v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_and_b32_e32 v52, 0xffff, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; SI-NEXT: v_and_b32_e32 v54, 0xffff, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: v_and_b32_e32 v56, 0xffff, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v39 +; SI-NEXT: v_and_b32_e32 v58, 0xffff, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_and_b32_e32 v60, 0xffff, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; SI-NEXT: v_and_b32_e32 v62, 0xffff, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v42 +; SI-NEXT: v_and_b32_e32 v42, 0xffff, v42 +; 
SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[42:43], v[62:63] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v0, v[40:41], v[60:61] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v0, v[38:39], v[58:59] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v0, v[36:37], v[56:57] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1 +; SI-NEXT: ds_write2_b64 v0, v[10:11], v[14:15] offset0:30 offset1:31 +; SI-NEXT: ds_write2_b64 v0, v[8:9], v[12:13] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[24:25] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v64i16_to_v64i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v56, s1 +; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:2 offset1:3 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset0:4 offset1:5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v22 +; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:6 offset1:7 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v18 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v22 +; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; GFX9-NEXT: v_and_b32_e32 
v40, 0xffff, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffff, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v44, 0xffff, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v46, 0xffff, v18 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff, v22 +; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v56, 0xffff, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v62, 0xffff, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v58, 0xffff, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v60, 0xffff, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31 +; GFX9-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29 +; GFX9-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27 +; GFX9-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25 +; GFX9-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23 
+; GFX9-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21 +; GFX9-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19 +; GFX9-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17 +; GFX9-NEXT: ds_write2_b64 v0, v[38:39], v[36:37] offset0:14 offset1:15 +; GFX9-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:12 offset1:13 +; GFX9-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:10 offset1:11 +; GFX9-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:8 offset1:9 +; GFX9-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3 +; GFX9-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v64i16_to_v64i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 116, @26, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 100(1.401298e-43), 
0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Z, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: 
MOV T5.W, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Y, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Z, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.W, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Y, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Z, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.W, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Y, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Z, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.W, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Y, OQAP, +; EG-NEXT: MOV * T9.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Z, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: ALU 95, @27, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.W, OQAP, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, +; EG-NEXT: 
12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T10.W +; EG-NEXT: MOV T10.Y, OQAP, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T10.W +; EG-NEXT: MOV T10.Z, OQAP, +; EG-NEXT: LSHR T10.W, T10.Y, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: AND_INT T10.W, T10.Y, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T10.W, T10.Z, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: AND_INT T10.W, T10.Z, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T10.W, T9.W, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: AND_INT T9.W, T9.W, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: LSHR T9.W, T9.Z, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: AND_INT T9.W, T9.Z, literal.x, +; EG-NEXT: MOV * T10.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: LSHR T9.W, T9.Y, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: AND_INT T9.W, T9.Y, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) +; 
EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: LSHR T9.W, T8.W, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: AND_INT T8.W, T8.W, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: LSHR T8.W, T8.Z, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: AND_INT T8.W, T8.Z, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: LSHR T8.W, T8.Y, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: AND_INT T8.W, T8.Y, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: LSHR T8.W, T7.W, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: AND_INT T7.W, T7.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: LSHR T7.W, T7.Z, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: AND_INT T7.W, T7.Z, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: LSHR T7.W, T7.Y, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: AND_INT * T7.W, T7.Y, 
literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: ALU 93, @28, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.x, +; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: LSHR T7.W, T6.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: AND_INT T6.W, T6.W, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: LSHR T6.W, T6.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: AND_INT T6.W, T6.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: LSHR T6.W, T6.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: AND_INT T6.W, T6.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: LSHR T6.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: AND_INT T5.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T5.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; 
EG-NEXT: LSHR T5.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T5.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T4.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T3.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * 
T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43) +; EG-NEXT: ALU 76, @29, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T2.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 200(2.802597e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 
literal.y, +; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: 
LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v64i16_to_v64i32: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0xe8f000 +; SI-NEXT: s_add_u32 s12, s12, s11 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:8 offset1:9 +; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:10 offset1:11 +; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:12 offset1:13 +; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1 +; SI-NEXT: ds_read2_b64 v[32:35], v20 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[36:39], v20 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[40:43], v20 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(7) +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v9 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v8 +; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v11 +; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt lgkmcnt(6) +; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v7 +; SI-NEXT: v_bfe_i32 v20, v9, 0, 16 
+; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_i32 v22, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v24, v11, 0, 16 +; SI-NEXT: v_bfe_i32 v26, v10, 0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v6 +; SI-NEXT: s_waitcnt lgkmcnt(5) +; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; SI-NEXT: v_bfe_i32 v28, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v30, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v44, v7, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v6, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v3 +; SI-NEXT: v_bfe_i32 v9, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v46, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2 +; SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 +; SI-NEXT: v_bfe_i32 v2, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v15 +; SI-NEXT: v_bfe_i32 v48, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 +; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v17 +; SI-NEXT: v_bfe_i32 v50, v17, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16 +; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v19 +; SI-NEXT: v_bfe_i32 v52, v19, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18 +; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33 +; SI-NEXT: v_bfe_i32 v54, v33, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32 +; SI-NEXT: v_bfe_i32 v32, v32, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35 +; SI-NEXT: v_bfe_i32 v56, v35, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34 +; SI-NEXT: v_bfe_i32 v34, v34, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37 +; SI-NEXT: 
v_bfe_i32 v58, v37, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36 +; SI-NEXT: v_bfe_i32 v36, v36, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39 +; SI-NEXT: v_bfe_i32 v60, v39, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38 +; SI-NEXT: v_bfe_i32 v38, v38, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41 +; SI-NEXT: v_bfe_i32 v62, v41, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40 +; SI-NEXT: v_bfe_i32 v40, v40, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v43 +; SI-NEXT: v_bfe_i32 v0, v43, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v42 +; SI-NEXT: v_bfe_i32 v42, v42, 0, 16 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: ds_write2_b64 v8, v[42:43], v[0:1] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v8, v[40:41], v[62:63] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v8, v[38:39], v[60:61] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v8, v[36:37], v[58:59] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v8, v[34:35], v[56:57] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v8, v[32:33], v[54:55] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v8, v[18:19], v[52:53] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[16:17], v[50:51] offset1:1 +; SI-NEXT: ds_write2_b64 v8, v[14:15], v[48:49] offset0:30 offset1:31 +; SI-NEXT: ds_write2_b64 v8, v[12:13], v[2:3] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[46:47] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v8, v[6:7], v[9:10] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v8, v[20:21], v[44:45] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v8, v[30:31], v[28:29] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v8, v[26:27], v[24:25] offset0:18 offset1:19 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ds_write2_b64 v8, v[22:23], v[0:1] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: 
local_sextload_v64i16_to_v64i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v28, s1 +; GFX9-NEXT: ds_read2_b64 v[16:19], v28 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[20:23], v28 offset0:2 offset1:3 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v17 +; GFX9-NEXT: v_bfe_i32 v0, v17, 0, 16 +; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: ds_read2_b64 v[24:27], v28 offset0:4 offset1:5 +; GFX9-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v16 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v19 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 16, v18 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v21 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v20 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 16, v23 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 16, v22 +; GFX9-NEXT: v_bfe_i32 v2, v16, 0, 16 +; GFX9-NEXT: v_bfe_i32 v4, v19, 0, 16 +; GFX9-NEXT: v_bfe_i32 v6, v18, 0, 16 +; GFX9-NEXT: v_bfe_i32 v8, v21, 0, 16 +; GFX9-NEXT: v_bfe_i32 v10, v20, 0, 16 +; GFX9-NEXT: v_bfe_i32 v12, v23, 0, 16 +; GFX9-NEXT: v_bfe_i32 v14, v22, 0, 16 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 16, v25 +; GFX9-NEXT: v_bfe_i32 v16, v25, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v19, 16, v24 +; GFX9-NEXT: v_bfe_i32 v18, v24, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v21, 16, v27 +; GFX9-NEXT: v_bfe_i32 v20, v27, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v23, 16, v26 +; GFX9-NEXT: v_bfe_i32 v22, v26, 0, 16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
v_ashrrev_i32_e32 v25, 16, v30 +; GFX9-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v27, 16, v29 +; GFX9-NEXT: v_bfe_i32 v26, v29, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v38, 16, v32 +; GFX9-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; GFX9-NEXT: v_bfe_i32 v37, v32, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v40, 16, v31 +; GFX9-NEXT: v_bfe_i32 v39, v31, 0, 16 +; GFX9-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_ashrrev_i32_e32 v42, 16, v34 +; GFX9-NEXT: v_bfe_i32 v41, v34, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v44, 16, v33 +; GFX9-NEXT: v_bfe_i32 v43, v33, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v46, 16, v36 +; GFX9-NEXT: v_bfe_i32 v45, v36, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v48, 16, v35 +; GFX9-NEXT: v_bfe_i32 v47, v35, 0, 16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v50, 16, v30 +; GFX9-NEXT: v_bfe_i32 v49, v30, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v52, 16, v29 +; GFX9-NEXT: v_bfe_i32 v51, v29, 0, 16 +; GFX9-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 +; GFX9-NEXT: v_ashrrev_i32_e32 v56, 16, v31 +; GFX9-NEXT: v_bfe_i32 v55, v31, 0, 16 +; GFX9-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; GFX9-NEXT: v_ashrrev_i32_e32 v54, 16, v32 +; GFX9-NEXT: v_bfe_i32 v53, v32, 0, 16 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_ashrrev_i32_e32 v58, 16, v34 +; GFX9-NEXT: v_bfe_i32 v57, v34, 0, 16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v32, 16, v31 +; GFX9-NEXT: v_bfe_i32 v31, v31, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v30 +; GFX9-NEXT: v_bfe_i32 v0, v30, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v30, s0 +; GFX9-NEXT: v_ashrrev_i32_e32 v34, 16, v33 +; GFX9-NEXT: v_bfe_i32 v33, v33, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v60, 16, v36 +; GFX9-NEXT: v_bfe_i32 v59, v36, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v36, 16, v35 +; GFX9-NEXT: v_bfe_i32 v35, v35, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v62, 16, v29 +; GFX9-NEXT: 
v_bfe_i32 v61, v29, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v29, 16, v28 +; GFX9-NEXT: v_bfe_i32 v28, v28, 0, 16 +; GFX9-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 +; GFX9-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 +; GFX9-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27 +; GFX9-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 +; GFX9-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 +; GFX9-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 +; GFX9-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 +; GFX9-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17 +; GFX9-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 +; GFX9-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 +; GFX9-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11 +; GFX9-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 +; GFX9-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v64i16_to_v64i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 116, @30, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: 
MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; 
EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Z, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.W, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Y, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Z, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.W, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Y, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Z, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.W, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Y, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Z, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.W, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00) +; EG-NEXT: 
LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Y, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Z, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: ALU 85, @31, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.W, OQAP, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T10.W +; EG-NEXT: MOV T10.Y, OQAP, +; EG-NEXT: LSHR T10.W, T9.W, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LDS_READ_RET * OQAP, T11.W +; EG-NEXT: MOV T10.Z, OQAP, +; EG-NEXT: LSHR * T11.Z, T10.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T10.W, T10.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T0.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; 
EG-NEXT: LSHR T12.Z, T1.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T1.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T2.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T2.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T2.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T3.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T3.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T3.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T4.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, 
literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T4.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: ALU 83, @32, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T4.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T5.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T5.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T5.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T6.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T6.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T6.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43) +; EG-NEXT: LDS_WRITE 
* T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T7.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T7.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T7.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T8.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T8.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T8.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T9.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T9.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T10.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43) +; EG-NEXT: ALU 94, @33, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: BFE_INT T9.W, T9.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T9.W, T10.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T9.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T9.W, T0.Z, 0.0, literal.x, +; EG-NEXT: MOV * T10.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T9.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T9.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T9.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT 
T0.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T6.Y, 
0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T6.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 34, @34, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T7.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T7.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T8.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T8.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T8.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T9.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T10.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <64 x i16>, ptr addrspace(3) %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_i16_to_i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]], -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} - -; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: LDS_WRITE define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_i16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_i16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_i16_to_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @35, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: 
MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = zext i16 %a to i64 store i64 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_i16_to_i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; FIXME: Need to optimize this sequence to avoid an extra shift. -; t25: i32,ch = load t12, t10, undef:i32 -; t28: i64 = any_extend t25 -; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16 -; SI: ds_read_i16 v[[LO:[0-9]+]], -; GFX89: ds_read_u16 v[[ULO:[0-9]+]] -; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 -; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] - -; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG-DAG: LDS_WRITE -; EG-DAG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_i16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_i16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_i16_to_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @36, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = sext i16 %a to i64 store i64 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: LDS_WRITE define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v1i16_to_v1i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v1i16_to_v1i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v1i16_to_v1i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @37, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG-DAG: LDS_WRITE -; EG-DAG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v1i16_to_v1i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v1i16_to_v1i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; 
GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v1i16_to_v1i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @38, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v2i16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v2i16_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v2, s1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v2i16_to_v2i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 17, @39, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: AND_INT T0.W, PV.X, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v2i16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; 
SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v2i16_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v2i16_to_v2i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 18, @40, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v4i16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; 
SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v3, 0 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v10, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v10, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v10, v[6:7], v[8:9] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v4i16_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v7, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v5, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v4i16_to_v4i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 35, @41, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; 
EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN %load = load <4 x i16>, ptr addrspace(3) %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v4i16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_lshrrev_b32_e32 
v3, 16, v0 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_ashr_i64 v[4:5], v[0:1], 48 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: ds_write2_b64 v8, v[2:3], v[4:5] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v4i16_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GFX9-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v4i16_to_v4i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 39, @42, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: BFE_INT * T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T1.Z, PV.Z, 0.0, literal.x, +; EG-NEXT: ASHR T1.W, PV.W, literal.y, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, 
T1.W, +; EG-NEXT: ASHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.Z, +; EG-NEXT: ASHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v8i16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v11, v5 +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; 
SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v20, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v20, v[8:9], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v20, v[12:13], v[4:5] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v20, v[10:11], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[18:19] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v8i16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v13, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v3 +; GFX9-NEXT: v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-NEXT: ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7 +; GFX9-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-NEXT: v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX9-NEXT: ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5 +; GFX9-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-NEXT: v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-NEXT: ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-NEXT: ds_write2_b64 v13, v[0:1], v[4:5] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v8i16_to_v8i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 71, @43, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: AND_INT T1.W, T0.W, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T1.Y, 
literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN %load = load <8 x i16>, ptr addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: 
local_sextload_v8i16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v16, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 48 +; SI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v5, 0, 16 +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 +; SI-NEXT: v_bfe_i32 v12, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v7, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: ds_write2_b64 v16, v[10:11], v[2:3] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v16, v[6:7], v[0:1] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v16, v[8:9], v[14:15] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v8i16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_bfe_i32 v14, v2, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_bfe_i32 v10, v9, 0, 16 +; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GFX9-NEXT: v_bfe_i32 v8, v7, 0, 16 +; 
GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v8i16_to_v8i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 80, @44, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.W, OQAP, +; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: BFE_INT T1.Z, T0.W, 0.0, literal.x, +; EG-NEXT: ASHR T2.W, T1.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: BFE_INT T2.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T2.W, T1.Z, literal.y, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, 
T2.W, +; EG-NEXT: BFE_INT T3.Z, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T2.W, T2.Z, literal.y, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: ASHR T2.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T1.Z, +; EG-NEXT: ASHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T2.Z, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T3.Z, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * 
T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <8 x i16>, ptr addrspace(3) %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v16i16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: v_mov_b32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v13, v9 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v9 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v29, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v29, v[17:18], v[14:15] offset0:10 offset1:11 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v29, v[20:21], v[12:13] offset0:14 offset1:15 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 
v18, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v28, v9 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: ds_write2_b64 v29, v[21:22], v[10:11] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v29, v[13:14], v[8:9] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v29, v[15:16], v[23:24] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v29, v[18:19], v[25:26] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v29, v[0:1], v[27:28] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v29, v[2:3], v[4:5] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v16i16_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v17, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX9-NEXT: ds_write2_b64 v17, v[13:14], v[15:16] offset0:10 offset1:11 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: ds_write2_b64 v17, 
v[15:16], v[7:8] offset0:8 offset1:9 +; GFX9-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-NEXT: ds_write2_b64 v17, v[14:15], v[6:7] offset0:12 offset1:13 +; GFX9-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: ds_write2_b64 v17, v[5:6], v[13:14] offset0:14 offset1:15 +; GFX9-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: ds_write2_b64 v17, v[3:4], v[12:13] offset0:6 offset1:7 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: ds_write2_b64 v17, v[2:3], v[11:12] offset0:4 offset1:5 +; GFX9-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write2_b64 v17, v[1:2], v[10:11] offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: ds_write2_b64 v17, v[0:1], v[9:10] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v16i16_to_v16i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 100, @45, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, 
literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: MOV * T2.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: MOV * T3.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, 
KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: ALU 42, @46, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 
44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; 
EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v16i16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: v_mov_b32_e32 v25, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_bfe_i32 v8, v4, 0, 16 +; SI-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 +; SI-NEXT: v_bfe_i32 v4, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v6, 0, 16 +; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 +; SI-NEXT: v_bfe_i32 v14, v13, 0, 16 +; SI-NEXT: v_bfe_i32 v16, v0, 0, 16 +; SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48 +; SI-NEXT: v_bfe_i32 v19, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v25, v[19:20], v[17:18] offset0:14 offset1:15 +; SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48 +; SI-NEXT: v_bfe_i32 v0, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v23, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v21, 0, 16 +; SI-NEXT: v_bfe_i32 v23, v22, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: ds_write2_b64 v25, v[0:1], v[17:18] offset0:10 offset1:11 +; SI-NEXT: v_bfe_i32 v0, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: 
v_ashrrev_i32_e32 v17, 31, v16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: ds_write2_b64 v25, v[14:15], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v25, v[4:5], v[10:11] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v25, v[2:3], v[0:1] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v25, v[16:17], v[23:24] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v25, v[12:13], v[21:22] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v25, v[8:9], v[19:20] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v16i16_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v26, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX9-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GFX9-NEXT: v_bfe_i32 v24, v4, 0, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX9-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GFX9-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NEXT: ds_write2_b64 v26, v[24:25], v[22:23] offset0:8 offset1:9 +; GFX9-NEXT: v_mov_b32_e32 v24, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX9-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX9-NEXT: v_bfe_i32 
v10, v9, 0, 16 +; GFX9-NEXT: v_bfe_i32 v12, v11, 0, 16 +; GFX9-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX9-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NEXT: v_bfe_i32 v22, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NEXT: ds_write2_b64 v26, v[4:5], v[20:21] offset0:10 offset1:11 +; GFX9-NEXT: v_bfe_i32 v4, v24, 0, 16 +; GFX9-NEXT: v_bfe_i32 v20, v7, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NEXT: ds_write2_b64 v26, v[6:7], v[18:19] offset0:12 offset1:13 +; GFX9-NEXT: ds_write2_b64 v26, v[20:21], v[16:17] offset0:14 offset1:15 +; GFX9-NEXT: ds_write2_b64 v26, v[4:5], v[14:15] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v26, v[2:3], v[12:13] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v26, v[0:1], v[10:11] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v26, v[22:23], v[8:9] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v16i16_to_v16i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 101, @47, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, 
KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: MOV * T1.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: BFE_INT T2.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV * T2.Z, OQAP, +; EG-NEXT: BFE_INT T3.Z, T2.Y, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T2.W, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T4.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T3.Z, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T5.Z, T0.Z, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T4.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T6.Z, T0.W, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T5.Z, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T6.Z, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 
16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T7.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T9.Z, T2.Z, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T8.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: ASHR T3.W, T9.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: ASHR T3.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: ASHR T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T3.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T2.W, +; EG-NEXT: ASHR T1.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T3.Z, +; EG-NEXT: ASHR T1.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) +; EG-NEXT: ALU 62, @48, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Y, 
literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T4.Z, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T5.Z, +; EG-NEXT: ASHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T6.Z, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T7.Z, +; EG-NEXT: ASHR T0.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T8.Z, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T9.Z, +; EG-NEXT: RETURN %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET + define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v32i16_to_v32i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[5:8], v13 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[0:3], v13 offset1:1 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: ds_read2_b64 v[9:12], v13 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[13:16], v13 offset0:6 offset1:7 +; SI-NEXT: v_mov_b32_e32 v28, s0 +; SI-NEXT: 
s_waitcnt lgkmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v8 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v6 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:10 offset1:11 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:30 offset1:31 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v14 +; SI-NEXT: ds_write2_b64 v28, v[20:21], v[18:19] offset0:26 offset1:27 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v12 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[16:17] offset0:22 offset1:23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; SI-NEXT: ds_write2_b64 v28, v[7:8], v[20:21] offset0:12 offset1:13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: ds_write2_b64 v28, v[18:19], v[5:6] offset0:8 offset1:9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; SI-NEXT: 
v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v2 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: ds_write2_b64 v28, v[8:9], v[3:4] offset0:18 offset1:19 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: ds_write2_b64 v28, v[26:27], v[20:21] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v28, v[24:25], v[22:23] offset1:1 +; SI-NEXT: ds_write2_b64 v28, v[18:19], v[1:2] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v28, v[16:17], v[12:13] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v28, v[10:11], v[7:8] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v28, v[5:6], v[14:15] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v32i16_to_v32i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v22, v1 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[2:5], v0 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[6:9], v0 offset0:4 offset1:5 +; GFX9-NEXT: ds_read2_b64 v[10:13], v0 offset0:6 offset1:7 +; GFX9-NEXT: v_mov_b32_e32 v23, s0 +; GFX9-NEXT: ds_read2_b64 v[14:17], v0 offset0:2 offset1:3 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v12 +; GFX9-NEXT: ds_write2_b64 v23, v[21:22], v[19:20] offset0:28 offset1:29 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-NEXT: ds_write2_b64 v23, v[11:12], v[19:20] offset0:26 
offset1:27 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX9-NEXT: ds_write2_b64 v23, v[19:20], v[11:12] offset0:24 offset1:25 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX9-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-NEXT: ds_write2_b64 v23, v[19:20], v[10:11] offset0:22 offset1:23 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: ds_write2_b64 v23, v[11:12], v[9:10] offset0:20 offset1:21 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX9-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:18 offset1:19 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v6 +; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:16 offset1:17 +; GFX9-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:14 offset1:15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:12 offset1:13 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: ds_write2_b64 v23, v[12:13], v[10:11] offset0:8 offset1:9 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: ds_write2_b64 v23, v[5:6], v[9:10] offset0:6 offset1:7 +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: ds_write2_b64 v23, v[4:5], v[8:9] offset0:4 offset1:5 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX9-NEXT: ds_write2_b64 v23, v[0:1], v[19:20] offset0:30 offset1:31 +; GFX9-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: ds_write2_b64 v23, v[3:4], v[7:8] offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: ds_write2_b64 v23, v[16:17], v[14:15] offset0:10 offset1:11 +; GFX9-NEXT: ds_write2_b64 v23, v[2:3], v[18:19] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v32i16_to_v32i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 105, @49, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT 
* T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: MOV * T5.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: LSHR T5.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T4.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T5.Y, literal.x, +; EG-NEXT: MOV * T5.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T3.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: ALU 93, @50, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: 
LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T2.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, 
literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: ALU 87, @51, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: 
LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 156(2.186026e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 140(1.961818e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 188(2.634441e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, 
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 172(2.410233e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 220(3.082857e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 204(2.858649e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 252(3.531272e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 236(3.307064e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 228(3.194960e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; 
EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v32i16_to_v32i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; SI-NEXT: v_mov_b32_e32 v16, s0 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48 +; SI-NEXT: v_bfe_i32 v19, v19, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:14 offset1:15 +; SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48 +; SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:10 offset1:11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_ashr_i64 v[17:18], v[6:7], 48 +; SI-NEXT: v_bfe_i32 v19, v21, 0, 16 +; 
SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:6 offset1:7 +; SI-NEXT: v_ashr_i64 v[17:18], v[4:5], 48 +; SI-NEXT: v_bfe_i32 v19, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:2 offset1:3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_ashr_i64 v[17:18], v[10:11], 48 +; SI-NEXT: v_bfe_i32 v19, v22, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:30 offset1:31 +; SI-NEXT: v_ashr_i64 v[17:18], v[8:9], 48 +; SI-NEXT: v_bfe_i32 v19, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:26 offset1:27 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_ashr_i64 v[17:18], v[14:15], 48 +; SI-NEXT: v_bfe_i32 v19, v23, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:22 offset1:23 +; SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48 +; SI-NEXT: v_bfe_i32 v19, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:18 offset1:19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_bfe_i32 v1, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v14, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v7, v10, 0, 16 +; SI-NEXT: v_bfe_i32 v9, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v13, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v21, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v24, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; SI-NEXT: ds_write2_b64 v16, v[13:14], v[20:21] offset0:12 offset1:13 +; SI-NEXT: v_bfe_i32 v14, v29, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v28, 0, 16 +; SI-NEXT: v_bfe_i32 v23, v27, 0, 16 +; SI-NEXT: v_bfe_i32 v24, v25, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: 
v_ashrrev_i32_e32 v25, 31, v24 +; SI-NEXT: ds_write2_b64 v16, v[12:13], v[24:25] offset0:8 offset1:9 +; SI-NEXT: v_bfe_i32 v25, v26, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; SI-NEXT: ds_write2_b64 v16, v[11:12], v[25:26] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v16, v[9:10], v[23:24] offset1:1 +; SI-NEXT: ds_write2_b64 v16, v[7:8], v[21:22] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v16, v[5:6], v[14:15] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v16, v[3:4], v[19:20] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v16, v[1:2], v[17:18] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v32i16_to_v32i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NEXT: ds_read2_b64 v[4:7], v12 offset1:1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v12 offset0:2 offset1:3 +; GFX9-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_bfe_i32 v23, v23, 0, 16 +; GFX9-NEXT: v_bfe_i32 v25, v15, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX9-NEXT: v_mov_b32_e32 v15, s0 +; GFX9-NEXT: 
ds_write2_b64 v15, v[25:26], v[23:24] offset0:30 offset1:31 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX9-NEXT: v_bfe_i32 v23, v23, 0, 16 +; GFX9-NEXT: v_bfe_i32 v25, v14, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; GFX9-NEXT: ds_write2_b64 v15, v[25:26], v[23:24] offset0:28 offset1:29 +; GFX9-NEXT: v_bfe_i32 v23, v14, 0, 16 +; GFX9-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GFX9-NEXT: ds_write2_b64 v15, v[13:14], v[23:24] offset0:26 offset1:27 +; GFX9-NEXT: v_bfe_i32 v24, v12, 0, 16 +; GFX9-NEXT: v_bfe_i32 v26, v25, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[26:27] offset0:24 offset1:25 +; GFX9-NEXT: v_bfe_i32 v24, v12, 0, 16 +; GFX9-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NEXT: ds_write2_b64 v15, v[11:12], v[24:25] offset0:22 offset1:23 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX9-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GFX9-NEXT: v_bfe_i32 v24, v10, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[11:12] offset0:20 offset1:21 +; GFX9-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GFX9-NEXT: v_bfe_i32 v24, v9, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[10:11] offset0:18 offset1:19 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; GFX9-NEXT: v_bfe_i32 v24, v8, 0, 16 +; GFX9-NEXT: v_bfe_i32 v26, v25, 0, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v27, 31, 
v26 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[26:27] offset0:16 offset1:17 +; GFX9-NEXT: v_bfe_i32 v24, v8, 0, 16 +; GFX9-NEXT: v_bfe_i32 v26, v3, 0, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GFX9-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GFX9-NEXT: ds_write2_b64 v15, v[26:27], v[24:25] offset0:14 offset1:15 +; GFX9-NEXT: v_mov_b32_e32 v26, v7 +; GFX9-NEXT: v_bfe_i32 v7, v2, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: v_bfe_i32 v13, v22, 0, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; GFX9-NEXT: v_bfe_i32 v9, v28, 0, 16 +; GFX9-NEXT: ds_write2_b64 v15, v[7:8], v[11:12] offset0:12 offset1:13 +; GFX9-NEXT: v_bfe_i32 v11, v0, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NEXT: v_bfe_i32 v18, v17, 0, 16 +; GFX9-NEXT: v_bfe_i32 v20, v19, 0, 16 +; GFX9-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX9-NEXT: v_bfe_i32 v3, v4, 0, 16 +; GFX9-NEXT: v_bfe_i32 v24, v5, 0, 16 +; GFX9-NEXT: v_bfe_i32 v5, v6, 0, 16 +; GFX9-NEXT: v_bfe_i32 v7, v26, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NEXT: ds_write2_b64 v15, v[0:1], v[9:10] offset0:10 offset1:11 +; GFX9-NEXT: ds_write2_b64 v15, v[11:12], v[22:23] offset0:8 offset1:9 +; GFX9-NEXT: ds_write2_b64 v15, v[7:8], 
v[13:14] offset0:6 offset1:7 +; GFX9-NEXT: ds_write2_b64 v15, v[5:6], v[20:21] offset0:4 offset1:5 +; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[18:19] offset0:2 offset1:3 +; GFX9-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v32i16_to_v32i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 107, @52, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * 
T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T5.Z, OQAP, +; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV * T5.W, OQAP, +; EG-NEXT: BFE_INT T0.Z, T5.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T0.W, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T6.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T0.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T6.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T7.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: 
LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T9.Z, T1.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T8.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T10.Z, T2.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T9.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T11.Z, T2.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T10.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT * T12.Z, T2.W, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 98, @53, KC0[CB0:0-32], KC1[] +; EG-NEXT: ASHR T6.W, T11.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T13.Z, T3.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T12.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T14.Z, T3.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T13.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T15.Z, T3.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T14.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: 
BFE_INT T16.Z, T4.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T15.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T17.Z, T4.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T16.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T18.Z, T4.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T17.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T19.Z, T5.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T18.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ASHR T6.W, T19.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 228(3.194960e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ASHR T6.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ASHR T6.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; 
EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.Z, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T6.Z, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T7.Z, +; EG-NEXT: ASHR T0.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR * T0.W, T1.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 99, @54, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x, +; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T8.Z, +; EG-NEXT: ASHR T0.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * 
T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T9.Z, +; EG-NEXT: ASHR T0.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T10.Z, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T11.Z, +; EG-NEXT: ASHR T0.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 156(2.186026e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T12.Z, +; EG-NEXT: ASHR T0.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 140(1.961818e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 
128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T13.Z, +; EG-NEXT: ASHR T0.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 188(2.634441e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T14.Z, +; EG-NEXT: ASHR T0.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 172(2.410233e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T15.Z, +; EG-NEXT: ASHR T0.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 220(3.082857e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T16.Z, +; EG-NEXT: ASHR T0.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 204(2.858649e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR * T0.W, T4.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 27, @55, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 200(2.802597e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; 
EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T17.Z, +; EG-NEXT: ASHR T0.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 252(3.531272e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T18.Z, +; EG-NEXT: ASHR T0.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 236(3.307064e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T19.Z, +; EG-NEXT: RETURN %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(3) %out ret void } -; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64: -; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { -; %load = load <64 x i16>, ptr addrspace(3) %in -; %ext = zext <64 x i16> %load to <64 x i64> -; store <64 x i64> %ext, ptr addrspace(3) %out -; ret void -; } - -; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64: -; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { -; %load = load <64 x i16>, ptr addrspace(3) %in -; %ext = sext <64 x i16> %load to <64 x i64> -; store <64 x i64> %ext, ptr addrspace(3) %out -; ret void -; } - -; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load. 
-; FUNC-LABEL: {{^}}local_v8i16_to_128: - -; SI-NOT: ds_read_b128 -; SI-NOT: ds_write_b128 - -; CIVI: ds_read_b128 -; CIVI: ds_write_b128 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_v8i16_to_128: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: local_v8i16_to_128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_v8i16_to_128: +; EG: ; %bb.0: +; EG-NEXT: ALU 25, @56, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 
0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN %ld = load <8 x i16>, ptr addrspace(3) %in, align 16 store <8 x i16> %ld, ptr addrspace(3) %out, align 16 ret void } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CIVI: {{.*}} +; FUNC: {{.*}} +; GCN: {{.*}} +; GFX89: {{.*}} +; SICIVI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index af7f92798a931..336e29b53dfbe 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -496,52 +496,51 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @22, KC0[], KC1[] ; EG-NEXT: TEX 7 @6 -; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 +; EG-NEXT: ALU 29, @23, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3 -; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3 -; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3 -; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3 -; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3 -; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3 -; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3 -; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3 +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 73, #3 +; EG-NEXT: VTX_READ_8 T2.X, T0.X, 111, #3 +; EG-NEXT: VTX_READ_8 T3.X, T0.X, 75, #3 +; EG-NEXT: VTX_READ_8 T4.X, T0.X, 108, #3 +; EG-NEXT: VTX_READ_8 T5.X, T0.X, 72, #3 +; EG-NEXT: VTX_READ_8 T6.X, T0.X, 110, #3 +; EG-NEXT: VTX_READ_8 T7.X, T0.X, 74, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 109, #3 ; EG-NEXT: ALU clause starting at 22: -; EG-NEXT: MOV * T4.X, 0.0, +; EG-NEXT: MOV * T0.X, 0.0, ; EG-NEXT: ALU 
clause starting at 23: -; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201 +; EG-NEXT: BFE_INT T0.X, T7.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T0.Y, T6.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_INT * T1.Z, T5.X, 0.0, literal.x, BS:VEC_201 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T1.W, T4.X, 0.0, literal.x, ; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W, ; EG-NEXT: AND_INT T0.W, PS, literal.x, -; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y, +; EG-NEXT: MIN_INT * T1.W, T0.X, T0.Y, ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, PS, literal.x, +; EG-NEXT: AND_INT T0.X, PS, literal.x, ; EG-NEXT: LSHL T0.Y, PV.W, literal.y, -; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z, -; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212 -; EG-NEXT: LSHL * T1.W, PV.Z, literal.w, -; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) -; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: BFE_INT T1.Z, T3.X, 0.0, literal.y, +; EG-NEXT: BFE_INT T0.W, T2.X, 0.0, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x, +; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W, ; EG-NEXT: OR_INT T0.W, PS, PV.Y, ; EG-NEXT: LSHL * T1.W, PV.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; 
EG-NEXT: OR_INT T4.X, PV.W, PS, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHL * T1.W, PV.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: s_test_imin_sle_v4i8: @@ -728,30 +727,30 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; EG-NEXT: ALU 0, @14, KC0[], KC1[] ; EG-NEXT: TEX 3 @6 ; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 -; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3 -; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3 -; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3 +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 +; EG-NEXT: VTX_READ_16 T2.X, T0.X, 46, #3 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 42, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T4.X, 0.0, +; EG-NEXT: MOV * T0.X, 0.0, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201 +; EG-NEXT: BFE_INT T1.X, T1.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T0.Y, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_INT * T0.Z, T3.X, 0.0, literal.x, BS:VEC_201 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W, -; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y, -; EG-NEXT: LSHL T1.W, PS, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT T4.X, PV.W, PS, -; EG-NEXT: LSHR * 
T5.X, KC0[2].Y, literal.x, +; EG-NEXT: MIN_INT * T1.W, T1.X, T0.Y, +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: s_test_imin_sle_v2i16: @@ -3977,37 +3976,34 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; EG-NEXT: TEX 0 @8 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @10 -; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1 +; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 8: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 ; EG-NEXT: ALU clause starting at 12: ; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W, +; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W, ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: LSHR T1.W, T0.X, literal.x, -; EG-NEXT: LSHR * T2.W, T7.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_INT T0.Y, T0.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR T1.W, T0.X, literal.x, +; EG-NEXT: ASHR * T2.W, T1.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z, -; EG-NEXT: 
MIN_INT * T2.W, PV.Y, PV.X, -; EG-NEXT: LSHL T2.W, PS, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT T0.X, PS, PV.W, +; EG-NEXT: MIN_INT T1.W, PS, PV.W, +; EG-NEXT: MIN_INT * T2.W, PV.Z, PV.Y, +; EG-NEXT: AND_INT T2.W, PS, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT T0.X, PV.W, PS, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T7.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: v_test_imin_sle_v2i16: @@ -4131,25 +4127,25 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @10 ; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 8: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 ; EG-NEXT: ALU clause starting at 12: ; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W, +; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W, ; EG-NEXT: ALU clause starting at 16: ; EG-NEXT: LSHR T1.W, T0.X, literal.x, -; EG-NEXT: LSHR * T2.W, T7.X, literal.x, +; EG-NEXT: LSHR * T2.W, T1.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, -; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T3.W, T1.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: MIN_UINT * T1.W, PS, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHL T1.W, PS, literal.x, @@ 
-4157,7 +4153,7 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT T0.X, PS, PV.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T7.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: v_test_imin_ule_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll index 5c0192d0d1af5..5c06506a542e8 100644 --- a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll @@ -141,35 +141,28 @@ define amdgpu_kernel void @v2i16_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v4i16_extract_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; EG-LABEL: v4i16_extract_i8: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 17, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 6, #1 -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHL * T0.W, T6.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, T5.X, -; EG-NEXT: MOV * T3.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: LSHR * T1.W, T0.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 8(1.121039e-44) -; EG-NEXT: BFE_UINT T1.W, PV.Y, literal.x, PS, -; EG-NEXT: 
LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45) -; EG-NEXT: LSHL T5.X, PV.W, PS, -; EG-NEXT: LSHL * T5.W, literal.x, PS, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, T1.W, PV.W, +; EG-NEXT: LSHL * T0.W, literal.x, PV.W, ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV * T5.Z, 0.0, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %load = load <4 x i16>, ptr addrspace(1) %in, align 2 %bc = bitcast <4 x i16> %load to <8 x i8> diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 6b4bca11d80c7..2fa1fcaaebd4d 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -590,31 +590,31 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @10 ; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 8: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1 ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 ; EG-NEXT: ALU clause starting at 12: ; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 16: ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, ; EG-NEXT: LSHR T0.W, T0.X, literal.y, -; EG-NEXT: LSHR * T1.W, T7.X, literal.y, +; EG-NEXT: LSHR * T1.W, T1.X, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LSHL 
T0.W, PS, PV.W, -; EG-NEXT: LSHL * T1.W, T7.X, PV.Z, +; EG-NEXT: LSHL * T1.W, T1.X, PV.Z, ; EG-NEXT: AND_INT T1.W, PS, literal.x, ; EG-NEXT: LSHL * T0.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: OR_INT T0.X, PV.W, PS, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 9d550ec27a63b..8d121988e98bf 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -206,29 +206,27 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1 +; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T6.XY, T6.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T6.X, KC0[2].Z, +; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T0.W, T6.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR T0.Z, T6.Y, literal.x, -; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T6.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: ASHR T0.W, PV.W, PS, -; EG-NEXT: ASHR * T1.W, PV.Y, PV.Z, -; EG-NEXT: LSHL T1.W, PS, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT T6.X, PS, PV.W, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; 
EG-NEXT: AND_INT T1.Y, T0.Y, literal.x, +; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.y, +; EG-NEXT: LSHR T0.W, T0.Y, literal.y, +; EG-NEXT: ASHR * T1.W, T0.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: ASHR T0.W, PS, PV.W, +; EG-NEXT: ASHR * T1.W, PV.Z, PV.Y, +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in, i16 1 %a = load <2 x i16>, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll index 28a7dc046139b..8c16adc5f2351 100644 --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -514,77 +514,133 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) { ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: i8_4: -; MIPS32R5: # %bb.0: -; MIPS32R5-NEXT: addiu $sp, $sp, -16 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS32R5-NEXT: sw $5, 8($sp) -; MIPS32R5-NEXT: sw $4, 12($sp) -; MIPS32R5-NEXT: lbu $1, 9($sp) -; MIPS32R5-NEXT: lbu $2, 8($sp) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $1 -; MIPS32R5-NEXT: lbu $1, 10($sp) -; MIPS32R5-NEXT: insert.w $w0[2], $1 -; MIPS32R5-NEXT: lbu $1, 11($sp) -; MIPS32R5-NEXT: insert.w $w0[3], $1 -; MIPS32R5-NEXT: lbu $1, 13($sp) -; MIPS32R5-NEXT: lbu $2, 12($sp) -; MIPS32R5-NEXT: insert.w $w1[0], $2 -; MIPS32R5-NEXT: insert.w $w1[1], $1 -; MIPS32R5-NEXT: lbu $1, 14($sp) -; MIPS32R5-NEXT: insert.w $w1[2], $1 -; MIPS32R5-NEXT: lbu $1, 15($sp) -; MIPS32R5-NEXT: insert.w $w1[3], $1 -; MIPS32R5-NEXT: addv.w $w0, $w1, $w0 -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $4, $w0[3] -; MIPS32R5-NEXT: sb $4, 7($sp) -; MIPS32R5-NEXT: 
sb $3, 6($sp) -; MIPS32R5-NEXT: sb $2, 5($sp) -; MIPS32R5-NEXT: sb $1, 4($sp) -; MIPS32R5-NEXT: lw $2, 4($sp) -; MIPS32R5-NEXT: addiu $sp, $sp, 16 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: i8_4: +; MIPS32R5EB: # %bb.0: +; MIPS32R5EB-NEXT: srl $1, $5, 16 +; MIPS32R5EB-NEXT: srl $2, $5, 24 +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $1 +; MIPS32R5EB-NEXT: srl $1, $5, 8 +; MIPS32R5EB-NEXT: insert.w $w0[2], $1 +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: srl $1, $4, 16 +; MIPS32R5EB-NEXT: srl $2, $4, 24 +; MIPS32R5EB-NEXT: insert.w $w1[0], $2 +; MIPS32R5EB-NEXT: insert.w $w1[1], $1 +; MIPS32R5EB-NEXT: srl $1, $4, 8 +; MIPS32R5EB-NEXT: insert.w $w1[2], $1 +; MIPS32R5EB-NEXT: insert.w $w1[3], $4 +; MIPS32R5EB-NEXT: addv.w $w0, $w1, $w0 +; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] +; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] +; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] +; MIPS32R5EB-NEXT: copy_s.w $4, $w0[2] +; MIPS32R5EB-NEXT: andi $4, $4, 255 +; MIPS32R5EB-NEXT: ins $3, $4, 8, 24 +; MIPS32R5EB-NEXT: andi $2, $2, 255 +; MIPS32R5EB-NEXT: sll $2, $2, 16 +; MIPS32R5EB-NEXT: or $2, $3, $2 +; MIPS32R5EB-NEXT: sll $1, $1, 24 +; MIPS32R5EB-NEXT: or $2, $2, $1 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: i8_4: -; MIPS64R5: # %bb.0: -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sw $5, 8($sp) -; MIPS64R5-NEXT: sw $4, 12($sp) -; MIPS64R5-NEXT: lbu $1, 9($sp) -; MIPS64R5-NEXT: lbu $2, 8($sp) -; MIPS64R5-NEXT: insert.w $w0[0], $2 -; MIPS64R5-NEXT: insert.w $w0[1], $1 -; MIPS64R5-NEXT: lbu $1, 10($sp) -; MIPS64R5-NEXT: insert.w $w0[2], $1 -; MIPS64R5-NEXT: lbu $1, 11($sp) -; MIPS64R5-NEXT: insert.w $w0[3], $1 -; MIPS64R5-NEXT: lbu $1, 13($sp) -; MIPS64R5-NEXT: lbu $2, 12($sp) -; MIPS64R5-NEXT: insert.w $w1[0], $2 -; MIPS64R5-NEXT: insert.w $w1[1], $1 -; MIPS64R5-NEXT: lbu $1, 14($sp) -; MIPS64R5-NEXT: insert.w $w1[2], $1 -; MIPS64R5-NEXT: lbu $1, 
15($sp) -; MIPS64R5-NEXT: insert.w $w1[3], $1 -; MIPS64R5-NEXT: addv.w $w0, $w1, $w0 -; MIPS64R5-NEXT: copy_s.w $1, $w0[0] -; MIPS64R5-NEXT: copy_s.w $2, $w0[1] -; MIPS64R5-NEXT: copy_s.w $3, $w0[2] -; MIPS64R5-NEXT: copy_s.w $4, $w0[3] -; MIPS64R5-NEXT: sb $4, 7($sp) -; MIPS64R5-NEXT: sb $3, 6($sp) -; MIPS64R5-NEXT: sb $2, 5($sp) -; MIPS64R5-NEXT: sb $1, 4($sp) -; MIPS64R5-NEXT: lw $2, 4($sp) -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: i8_4: +; MIPS64R5EB: # %bb.0: +; MIPS64R5EB-NEXT: sll $1, $5, 0 +; MIPS64R5EB-NEXT: srl $2, $1, 16 +; MIPS64R5EB-NEXT: srl $3, $1, 24 +; MIPS64R5EB-NEXT: insert.w $w0[0], $3 +; MIPS64R5EB-NEXT: insert.w $w0[1], $2 +; MIPS64R5EB-NEXT: srl $2, $1, 8 +; MIPS64R5EB-NEXT: insert.w $w0[2], $2 +; MIPS64R5EB-NEXT: sll $2, $4, 0 +; MIPS64R5EB-NEXT: insert.w $w0[3], $1 +; MIPS64R5EB-NEXT: srl $1, $2, 16 +; MIPS64R5EB-NEXT: srl $3, $2, 24 +; MIPS64R5EB-NEXT: insert.w $w1[0], $3 +; MIPS64R5EB-NEXT: insert.w $w1[1], $1 +; MIPS64R5EB-NEXT: srl $1, $2, 8 +; MIPS64R5EB-NEXT: insert.w $w1[2], $1 +; MIPS64R5EB-NEXT: insert.w $w1[3], $2 +; MIPS64R5EB-NEXT: addv.w $w0, $w1, $w0 +; MIPS64R5EB-NEXT: copy_s.w $1, $w0[0] +; MIPS64R5EB-NEXT: copy_s.w $2, $w0[1] +; MIPS64R5EB-NEXT: copy_s.w $3, $w0[3] +; MIPS64R5EB-NEXT: copy_s.w $4, $w0[2] +; MIPS64R5EB-NEXT: andi $4, $4, 255 +; MIPS64R5EB-NEXT: ins $3, $4, 8, 24 +; MIPS64R5EB-NEXT: andi $2, $2, 255 +; MIPS64R5EB-NEXT: sll $2, $2, 16 +; MIPS64R5EB-NEXT: or $2, $3, $2 +; MIPS64R5EB-NEXT: sll $1, $1, 24 +; MIPS64R5EB-NEXT: or $2, $2, $1 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop +; +; MIPS32R5EL-LABEL: i8_4: +; MIPS32R5EL: # %bb.0: +; MIPS32R5EL-NEXT: srl $1, $5, 8 +; MIPS32R5EL-NEXT: insert.w $w0[0], $5 +; MIPS32R5EL-NEXT: insert.w $w0[1], $1 +; MIPS32R5EL-NEXT: srl $1, $5, 16 +; MIPS32R5EL-NEXT: insert.w $w0[2], $1 +; MIPS32R5EL-NEXT: srl $1, $5, 24 +; MIPS32R5EL-NEXT: insert.w $w0[3], $1 +; MIPS32R5EL-NEXT: srl $1, $4, 8 +; 
MIPS32R5EL-NEXT: insert.w $w1[0], $4 +; MIPS32R5EL-NEXT: insert.w $w1[1], $1 +; MIPS32R5EL-NEXT: srl $1, $4, 16 +; MIPS32R5EL-NEXT: insert.w $w1[2], $1 +; MIPS32R5EL-NEXT: srl $1, $4, 24 +; MIPS32R5EL-NEXT: insert.w $w1[3], $1 +; MIPS32R5EL-NEXT: addv.w $w0, $w1, $w0 +; MIPS32R5EL-NEXT: copy_s.w $1, $w0[3] +; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] +; MIPS32R5EL-NEXT: copy_s.w $3, $w0[0] +; MIPS32R5EL-NEXT: copy_s.w $4, $w0[1] +; MIPS32R5EL-NEXT: andi $4, $4, 255 +; MIPS32R5EL-NEXT: ins $3, $4, 8, 24 +; MIPS32R5EL-NEXT: andi $2, $2, 255 +; MIPS32R5EL-NEXT: sll $2, $2, 16 +; MIPS32R5EL-NEXT: or $2, $3, $2 +; MIPS32R5EL-NEXT: sll $1, $1, 24 +; MIPS32R5EL-NEXT: or $2, $2, $1 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: i8_4: +; MIPS64R5EL: # %bb.0: +; MIPS64R5EL-NEXT: sll $1, $5, 0 +; MIPS64R5EL-NEXT: srl $2, $1, 8 +; MIPS64R5EL-NEXT: insert.w $w0[0], $1 +; MIPS64R5EL-NEXT: insert.w $w0[1], $2 +; MIPS64R5EL-NEXT: srl $2, $1, 16 +; MIPS64R5EL-NEXT: insert.w $w0[2], $2 +; MIPS64R5EL-NEXT: sll $2, $4, 0 +; MIPS64R5EL-NEXT: srl $1, $1, 24 +; MIPS64R5EL-NEXT: insert.w $w0[3], $1 +; MIPS64R5EL-NEXT: srl $1, $2, 8 +; MIPS64R5EL-NEXT: insert.w $w1[0], $2 +; MIPS64R5EL-NEXT: insert.w $w1[1], $1 +; MIPS64R5EL-NEXT: srl $1, $2, 16 +; MIPS64R5EL-NEXT: insert.w $w1[2], $1 +; MIPS64R5EL-NEXT: srl $1, $2, 24 +; MIPS64R5EL-NEXT: insert.w $w1[3], $1 +; MIPS64R5EL-NEXT: addv.w $w0, $w1, $w0 +; MIPS64R5EL-NEXT: copy_s.w $1, $w0[3] +; MIPS64R5EL-NEXT: copy_s.w $2, $w0[2] +; MIPS64R5EL-NEXT: copy_s.w $3, $w0[0] +; MIPS64R5EL-NEXT: copy_s.w $4, $w0[1] +; MIPS64R5EL-NEXT: andi $4, $4, 255 +; MIPS64R5EL-NEXT: ins $3, $4, 8, 24 +; MIPS64R5EL-NEXT: andi $2, $2, 255 +; MIPS64R5EL-NEXT: sll $2, $2, 16 +; MIPS64R5EL-NEXT: or $2, $3, $2 +; MIPS64R5EL-NEXT: sll $1, $1, 24 +; MIPS64R5EL-NEXT: or $2, $2, $1 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop %1 = add <4 x i8> %a, %b ret <4 x i8> %1 } @@ -771,65 +827,80 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) { ; 
MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: i8_8: -; MIPS64R5: # %bb.0: -; MIPS64R5-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5-NEXT: sd $5, 16($sp) -; MIPS64R5-NEXT: lbu $1, 17($sp) -; MIPS64R5-NEXT: lbu $2, 16($sp) -; MIPS64R5-NEXT: sd $4, 24($sp) -; MIPS64R5-NEXT: insert.h $w0[0], $2 -; MIPS64R5-NEXT: insert.h $w0[1], $1 -; MIPS64R5-NEXT: lbu $1, 18($sp) -; MIPS64R5-NEXT: insert.h $w0[2], $1 -; MIPS64R5-NEXT: lbu $1, 19($sp) -; MIPS64R5-NEXT: insert.h $w0[3], $1 -; MIPS64R5-NEXT: lbu $1, 20($sp) -; MIPS64R5-NEXT: insert.h $w0[4], $1 -; MIPS64R5-NEXT: lbu $1, 25($sp) -; MIPS64R5-NEXT: lbu $2, 24($sp) -; MIPS64R5-NEXT: insert.h $w1[0], $2 -; MIPS64R5-NEXT: insert.h $w1[1], $1 -; MIPS64R5-NEXT: lbu $1, 21($sp) -; MIPS64R5-NEXT: lbu $2, 26($sp) -; MIPS64R5-NEXT: insert.h $w1[2], $2 -; MIPS64R5-NEXT: insert.h $w0[5], $1 -; MIPS64R5-NEXT: lbu $1, 27($sp) -; MIPS64R5-NEXT: lbu $2, 23($sp) -; MIPS64R5-NEXT: lbu $3, 22($sp) -; MIPS64R5-NEXT: lbu $4, 31($sp) -; MIPS64R5-NEXT: insert.h $w0[6], $3 -; MIPS64R5-NEXT: insert.h $w0[7], $2 -; MIPS64R5-NEXT: insert.h $w1[3], $1 -; MIPS64R5-NEXT: lbu $1, 28($sp) -; MIPS64R5-NEXT: insert.h $w1[4], $1 -; MIPS64R5-NEXT: lbu $1, 29($sp) -; MIPS64R5-NEXT: insert.h $w1[5], $1 -; MIPS64R5-NEXT: lbu $1, 30($sp) -; MIPS64R5-NEXT: insert.h $w1[6], $1 -; MIPS64R5-NEXT: insert.h $w1[7], $4 -; MIPS64R5-NEXT: addv.h $w0, $w1, $w0 -; MIPS64R5-NEXT: copy_s.h $1, $w0[0] -; MIPS64R5-NEXT: copy_s.h $2, $w0[1] -; MIPS64R5-NEXT: copy_s.h $3, $w0[2] -; MIPS64R5-NEXT: copy_s.h $4, $w0[3] -; MIPS64R5-NEXT: copy_s.h $5, $w0[4] -; MIPS64R5-NEXT: copy_s.h $6, $w0[5] -; MIPS64R5-NEXT: copy_s.h $7, $w0[6] -; MIPS64R5-NEXT: copy_s.h $8, $w0[7] -; MIPS64R5-NEXT: sb $8, 15($sp) -; MIPS64R5-NEXT: sb $7, 14($sp) -; MIPS64R5-NEXT: sb $6, 13($sp) -; MIPS64R5-NEXT: sb $5, 12($sp) -; MIPS64R5-NEXT: sb $4, 11($sp) -; MIPS64R5-NEXT: sb $3, 10($sp) -; MIPS64R5-NEXT: sb $2, 9($sp) -; MIPS64R5-NEXT: sb $1, 8($sp) 
-; MIPS64R5-NEXT: ld $2, 8($sp) -; MIPS64R5-NEXT: daddiu $sp, $sp, 32 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: i8_8: +; MIPS64R5EB: # %bb.0: +; MIPS64R5EB-NEXT: dsrl $1, $5, 48 +; MIPS64R5EB-NEXT: dsrl $2, $5, 40 +; MIPS64R5EB-NEXT: dsrl $3, $4, 48 +; MIPS64R5EB-NEXT: sll $1, $1, 0 +; MIPS64R5EB-NEXT: dsrl $6, $5, 56 +; MIPS64R5EB-NEXT: sll $6, $6, 0 +; MIPS64R5EB-NEXT: insert.h $w0[0], $6 +; MIPS64R5EB-NEXT: insert.h $w0[1], $1 +; MIPS64R5EB-NEXT: sll $1, $2, 0 +; MIPS64R5EB-NEXT: sll $2, $3, 0 +; MIPS64R5EB-NEXT: dsrl $3, $4, 56 +; MIPS64R5EB-NEXT: sll $3, $3, 0 +; MIPS64R5EB-NEXT: insert.h $w1[0], $3 +; MIPS64R5EB-NEXT: insert.h $w1[1], $2 +; MIPS64R5EB-NEXT: insert.h $w0[2], $1 +; MIPS64R5EB-NEXT: dsrl $1, $4, 40 +; MIPS64R5EB-NEXT: sll $1, $1, 0 +; MIPS64R5EB-NEXT: dsrl $2, $5, 32 +; MIPS64R5EB-NEXT: sll $2, $2, 0 +; MIPS64R5EB-NEXT: insert.h $w0[3], $2 +; MIPS64R5EB-NEXT: insert.h $w1[2], $1 +; MIPS64R5EB-NEXT: dsrl $1, $5, 24 +; MIPS64R5EB-NEXT: dsrl $2, $4, 24 +; MIPS64R5EB-NEXT: sll $1, $1, 0 +; MIPS64R5EB-NEXT: dsrl $3, $4, 32 +; MIPS64R5EB-NEXT: sll $3, $3, 0 +; MIPS64R5EB-NEXT: insert.h $w1[3], $3 +; MIPS64R5EB-NEXT: insert.h $w0[4], $1 +; MIPS64R5EB-NEXT: sll $1, $5, 0 +; MIPS64R5EB-NEXT: srl $3, $1, 16 +; MIPS64R5EB-NEXT: insert.h $w0[5], $3 +; MIPS64R5EB-NEXT: sll $2, $2, 0 +; MIPS64R5EB-NEXT: srl $3, $1, 8 +; MIPS64R5EB-NEXT: insert.h $w0[6], $3 +; MIPS64R5EB-NEXT: insert.h $w0[7], $1 +; MIPS64R5EB-NEXT: insert.h $w1[4], $2 +; MIPS64R5EB-NEXT: sll $1, $4, 0 +; MIPS64R5EB-NEXT: srl $2, $1, 16 +; MIPS64R5EB-NEXT: insert.h $w1[5], $2 +; MIPS64R5EB-NEXT: srl $2, $1, 8 +; MIPS64R5EB-NEXT: insert.h $w1[6], $2 +; MIPS64R5EB-NEXT: insert.h $w1[7], $1 +; MIPS64R5EB-NEXT: addv.h $w0, $w1, $w0 +; MIPS64R5EB-NEXT: copy_s.h $1, $w0[1] +; MIPS64R5EB-NEXT: copy_s.h $2, $w0[0] +; MIPS64R5EB-NEXT: copy_s.h $3, $w0[2] +; MIPS64R5EB-NEXT: copy_s.h $4, $w0[3] +; MIPS64R5EB-NEXT: copy_s.h $5, $w0[4] +; MIPS64R5EB-NEXT: copy_s.h $6, $w0[5] +; 
MIPS64R5EB-NEXT: copy_s.h $7, $w0[6] +; MIPS64R5EB-NEXT: copy_s.h $8, $w0[7] +; MIPS64R5EB-NEXT: andi $7, $7, 255 +; MIPS64R5EB-NEXT: dinsm $8, $7, 8, 56 +; MIPS64R5EB-NEXT: andi $6, $6, 255 +; MIPS64R5EB-NEXT: dsll $6, $6, 16 +; MIPS64R5EB-NEXT: or $6, $8, $6 +; MIPS64R5EB-NEXT: andi $5, $5, 255 +; MIPS64R5EB-NEXT: dsll $5, $5, 24 +; MIPS64R5EB-NEXT: or $5, $6, $5 +; MIPS64R5EB-NEXT: andi $4, $4, 255 +; MIPS64R5EB-NEXT: dsll $4, $4, 32 +; MIPS64R5EB-NEXT: or $4, $5, $4 +; MIPS64R5EB-NEXT: andi $3, $3, 255 +; MIPS64R5EB-NEXT: dsll $3, $3, 40 +; MIPS64R5EB-NEXT: or $3, $4, $3 +; MIPS64R5EB-NEXT: andi $1, $1, 255 +; MIPS64R5EB-NEXT: dsll $1, $1, 48 +; MIPS64R5EB-NEXT: or $1, $3, $1 +; MIPS64R5EB-NEXT: dsll $2, $2, 56 +; MIPS64R5EB-NEXT: or $2, $1, $2 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32R5EL-LABEL: i8_8: ; MIPS32R5EL: # %bb.0: @@ -909,6 +980,85 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) { ; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: i8_8: +; MIPS64R5EL: # %bb.0: +; MIPS64R5EL-NEXT: dsrl $1, $5, 8 +; MIPS64R5EL-NEXT: dsrl $2, $4, 8 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: sll $3, $5, 0 +; MIPS64R5EL-NEXT: insert.h $w0[0], $3 +; MIPS64R5EL-NEXT: insert.h $w0[1], $1 +; MIPS64R5EL-NEXT: sll $1, $2, 0 +; MIPS64R5EL-NEXT: sll $2, $4, 0 +; MIPS64R5EL-NEXT: dsrl $3, $5, 16 +; MIPS64R5EL-NEXT: sll $3, $3, 0 +; MIPS64R5EL-NEXT: insert.h $w0[2], $3 +; MIPS64R5EL-NEXT: insert.h $w1[0], $2 +; MIPS64R5EL-NEXT: insert.h $w1[1], $1 +; MIPS64R5EL-NEXT: dsrl $1, $4, 16 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: insert.h $w1[2], $1 +; MIPS64R5EL-NEXT: dsrl $1, $5, 32 +; MIPS64R5EL-NEXT: dsrl $2, $4, 32 +; MIPS64R5EL-NEXT: dsrl $3, $5, 24 +; MIPS64R5EL-NEXT: sll $3, $3, 0 +; MIPS64R5EL-NEXT: insert.h $w0[3], $3 +; MIPS64R5EL-NEXT: dsrl $3, $5, 56 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: dsrl $6, $4, 24 +; MIPS64R5EL-NEXT: sll $6, $6, 0 +; 
MIPS64R5EL-NEXT: insert.h $w1[3], $6 +; MIPS64R5EL-NEXT: insert.h $w0[4], $1 +; MIPS64R5EL-NEXT: sll $1, $2, 0 +; MIPS64R5EL-NEXT: dsrl $2, $5, 48 +; MIPS64R5EL-NEXT: dsrl $5, $5, 40 +; MIPS64R5EL-NEXT: sll $5, $5, 0 +; MIPS64R5EL-NEXT: dsrl $6, $4, 56 +; MIPS64R5EL-NEXT: dsrl $7, $4, 48 +; MIPS64R5EL-NEXT: insert.h $w0[5], $5 +; MIPS64R5EL-NEXT: sll $2, $2, 0 +; MIPS64R5EL-NEXT: insert.h $w0[6], $2 +; MIPS64R5EL-NEXT: sll $2, $3, 0 +; MIPS64R5EL-NEXT: insert.h $w0[7], $2 +; MIPS64R5EL-NEXT: insert.h $w1[4], $1 +; MIPS64R5EL-NEXT: dsrl $1, $4, 40 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: insert.h $w1[5], $1 +; MIPS64R5EL-NEXT: sll $1, $7, 0 +; MIPS64R5EL-NEXT: insert.h $w1[6], $1 +; MIPS64R5EL-NEXT: sll $1, $6, 0 +; MIPS64R5EL-NEXT: insert.h $w1[7], $1 +; MIPS64R5EL-NEXT: addv.h $w0, $w1, $w0 +; MIPS64R5EL-NEXT: copy_s.h $1, $w0[6] +; MIPS64R5EL-NEXT: copy_s.h $2, $w0[7] +; MIPS64R5EL-NEXT: copy_s.h $3, $w0[5] +; MIPS64R5EL-NEXT: copy_s.h $4, $w0[4] +; MIPS64R5EL-NEXT: copy_s.h $5, $w0[3] +; MIPS64R5EL-NEXT: copy_s.h $6, $w0[2] +; MIPS64R5EL-NEXT: copy_s.h $7, $w0[1] +; MIPS64R5EL-NEXT: copy_s.h $8, $w0[0] +; MIPS64R5EL-NEXT: andi $7, $7, 255 +; MIPS64R5EL-NEXT: dinsm $8, $7, 8, 56 +; MIPS64R5EL-NEXT: andi $6, $6, 255 +; MIPS64R5EL-NEXT: dsll $6, $6, 16 +; MIPS64R5EL-NEXT: or $6, $8, $6 +; MIPS64R5EL-NEXT: andi $5, $5, 255 +; MIPS64R5EL-NEXT: dsll $5, $5, 24 +; MIPS64R5EL-NEXT: or $5, $6, $5 +; MIPS64R5EL-NEXT: andi $4, $4, 255 +; MIPS64R5EL-NEXT: dsll $4, $4, 32 +; MIPS64R5EL-NEXT: or $4, $5, $4 +; MIPS64R5EL-NEXT: andi $3, $3, 255 +; MIPS64R5EL-NEXT: dsll $3, $3, 40 +; MIPS64R5EL-NEXT: or $3, $4, $3 +; MIPS64R5EL-NEXT: andi $1, $1, 255 +; MIPS64R5EL-NEXT: dsll $1, $1, 48 +; MIPS64R5EL-NEXT: or $1, $3, $1 +; MIPS64R5EL-NEXT: dsll $2, $2, 56 +; MIPS64R5EL-NEXT: or $2, $1, $2 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop %1 = add <8 x i8> %a, %b ret <8 x i8> %1 } @@ -1221,102 +1371,101 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) { ; ; 
MIPS32R5EB-LABEL: i16_2: ; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -64 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 64 -; MIPS32R5EB-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 +; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 ; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 ; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $5, 48($sp) -; MIPS32R5EB-NEXT: sw $4, 52($sp) -; MIPS32R5EB-NEXT: lhu $1, 50($sp) -; MIPS32R5EB-NEXT: sw $1, 28($sp) -; MIPS32R5EB-NEXT: lhu $1, 48($sp) +; MIPS32R5EB-NEXT: sw $5, 28($sp) +; MIPS32R5EB-NEXT: srl $1, $5, 16 ; MIPS32R5EB-NEXT: sw $1, 20($sp) -; MIPS32R5EB-NEXT: lhu $1, 54($sp) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lhu $1, 52($sp) +; MIPS32R5EB-NEXT: sw $4, 12($sp) +; MIPS32R5EB-NEXT: srl $1, $4, 16 ; MIPS32R5EB-NEXT: sw $1, 4($sp) ; MIPS32R5EB-NEXT: ld.d $w0, 16($sp) ; MIPS32R5EB-NEXT: ld.d $w1, 0($sp) ; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0 ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5EB-NEXT: sh $2, 46($sp) -; MIPS32R5EB-NEXT: sh $1, 44($sp) -; MIPS32R5EB-NEXT: lw $2, 44($sp) +; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] +; MIPS32R5EB-NEXT: ins $2, $1, 16, 16 ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 64 +; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: 
i16_2: -; MIPS64R5: # %bb.0: -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sw $5, 8($sp) -; MIPS64R5-NEXT: sw $4, 12($sp) -; MIPS64R5-NEXT: lh $1, 10($sp) -; MIPS64R5-NEXT: lh $2, 8($sp) -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $1 -; MIPS64R5-NEXT: lh $1, 14($sp) -; MIPS64R5-NEXT: lh $2, 12($sp) -; MIPS64R5-NEXT: insert.d $w1[0], $2 -; MIPS64R5-NEXT: insert.d $w1[1], $1 -; MIPS64R5-NEXT: addv.d $w0, $w1, $w0 -; MIPS64R5-NEXT: copy_s.d $1, $w0[0] -; MIPS64R5-NEXT: copy_s.d $2, $w0[1] -; MIPS64R5-NEXT: sh $2, 6($sp) -; MIPS64R5-NEXT: sh $1, 4($sp) -; MIPS64R5-NEXT: lw $2, 4($sp) -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: i16_2: +; MIPS64R5EB: # %bb.0: +; MIPS64R5EB-NEXT: sll $1, $5, 0 +; MIPS64R5EB-NEXT: srl $1, $1, 16 +; MIPS64R5EB-NEXT: insert.d $w0[0], $1 +; MIPS64R5EB-NEXT: insert.d $w0[1], $5 +; MIPS64R5EB-NEXT: sll $1, $4, 0 +; MIPS64R5EB-NEXT: srl $1, $1, 16 +; MIPS64R5EB-NEXT: insert.d $w1[0], $1 +; MIPS64R5EB-NEXT: insert.d $w1[1], $4 +; MIPS64R5EB-NEXT: addv.d $w0, $w1, $w0 +; MIPS64R5EB-NEXT: shf.w $w0, $w0, 177 +; MIPS64R5EB-NEXT: copy_s.w $2, $w0[3] +; MIPS64R5EB-NEXT: copy_s.w $1, $w0[1] +; MIPS64R5EB-NEXT: ins $2, $1, 16, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32R5EL-LABEL: i16_2: ; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -64 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 64 -; MIPS32R5EL-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 +; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 ; 
MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $5, 48($sp) -; MIPS32R5EL-NEXT: sw $4, 52($sp) -; MIPS32R5EL-NEXT: lhu $1, 50($sp) +; MIPS32R5EL-NEXT: sw $5, 16($sp) +; MIPS32R5EL-NEXT: srl $1, $5, 16 ; MIPS32R5EL-NEXT: sw $1, 24($sp) -; MIPS32R5EL-NEXT: lhu $1, 48($sp) -; MIPS32R5EL-NEXT: sw $1, 16($sp) -; MIPS32R5EL-NEXT: lhu $1, 54($sp) +; MIPS32R5EL-NEXT: sw $4, 0($sp) +; MIPS32R5EL-NEXT: srl $1, $4, 16 ; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lhu $1, 52($sp) -; MIPS32R5EL-NEXT: sw $1, 0($sp) ; MIPS32R5EL-NEXT: ld.d $w0, 16($sp) ; MIPS32R5EL-NEXT: ld.d $w1, 0($sp) ; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0 -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] -; MIPS32R5EL-NEXT: sh $2, 46($sp) -; MIPS32R5EL-NEXT: sh $1, 44($sp) -; MIPS32R5EL-NEXT: lw $2, 44($sp) +; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] +; MIPS32R5EL-NEXT: copy_s.w $1, $w0[2] +; MIPS32R5EL-NEXT: ins $2, $1, 16, 16 ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 64 +; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: i16_2: +; MIPS64R5EL: # %bb.0: +; MIPS64R5EL-NEXT: sll $1, $5, 0 +; MIPS64R5EL-NEXT: srl $1, $1, 16 +; MIPS64R5EL-NEXT: insert.d $w0[0], $5 +; MIPS64R5EL-NEXT: insert.d $w0[1], $1 +; MIPS64R5EL-NEXT: sll $1, $4, 0 +; MIPS64R5EL-NEXT: srl $1, $1, 16 +; MIPS64R5EL-NEXT: insert.d $w1[0], $4 +; MIPS64R5EL-NEXT: insert.d $w1[1], $1 +; MIPS64R5EL-NEXT: addv.d $w0, $w1, $w0 +; MIPS64R5EL-NEXT: copy_s.w $2, $w0[0] +; MIPS64R5EL-NEXT: copy_s.w $1, $w0[2] +; MIPS64R5EL-NEXT: ins $2, $1, 16, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop %1 = add <2 x i16> %a, %b ret <2 x i16> %1 } @@ -1427,41 +1576,44 @@ define <4 x i16> @i16_4(<4 x 
i16> %a, <4 x i16> %b) { ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: i16_4: -; MIPS64R5: # %bb.0: -; MIPS64R5-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5-NEXT: sd $5, 16($sp) -; MIPS64R5-NEXT: sd $4, 24($sp) -; MIPS64R5-NEXT: lhu $1, 18($sp) -; MIPS64R5-NEXT: lhu $2, 16($sp) -; MIPS64R5-NEXT: insert.w $w0[0], $2 -; MIPS64R5-NEXT: insert.w $w0[1], $1 -; MIPS64R5-NEXT: lhu $1, 20($sp) -; MIPS64R5-NEXT: insert.w $w0[2], $1 -; MIPS64R5-NEXT: lhu $1, 22($sp) -; MIPS64R5-NEXT: insert.w $w0[3], $1 -; MIPS64R5-NEXT: lhu $1, 26($sp) -; MIPS64R5-NEXT: lhu $2, 24($sp) -; MIPS64R5-NEXT: insert.w $w1[0], $2 -; MIPS64R5-NEXT: insert.w $w1[1], $1 -; MIPS64R5-NEXT: lhu $1, 28($sp) -; MIPS64R5-NEXT: insert.w $w1[2], $1 -; MIPS64R5-NEXT: lhu $1, 30($sp) -; MIPS64R5-NEXT: insert.w $w1[3], $1 -; MIPS64R5-NEXT: addv.w $w0, $w1, $w0 -; MIPS64R5-NEXT: copy_s.w $1, $w0[0] -; MIPS64R5-NEXT: copy_s.w $2, $w0[1] -; MIPS64R5-NEXT: copy_s.w $3, $w0[2] -; MIPS64R5-NEXT: copy_s.w $4, $w0[3] -; MIPS64R5-NEXT: sh $4, 14($sp) -; MIPS64R5-NEXT: sh $3, 12($sp) -; MIPS64R5-NEXT: sh $2, 10($sp) -; MIPS64R5-NEXT: sh $1, 8($sp) -; MIPS64R5-NEXT: ld $2, 8($sp) -; MIPS64R5-NEXT: daddiu $sp, $sp, 32 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: i16_4: +; MIPS64R5EB: # %bb.0: +; MIPS64R5EB-NEXT: dsrl $1, $5, 32 +; MIPS64R5EB-NEXT: sll $1, $1, 0 +; MIPS64R5EB-NEXT: dsrl $2, $5, 48 +; MIPS64R5EB-NEXT: sll $2, $2, 0 +; MIPS64R5EB-NEXT: insert.w $w0[0], $2 +; MIPS64R5EB-NEXT: insert.w $w0[1], $1 +; MIPS64R5EB-NEXT: dsrl $1, $4, 32 +; MIPS64R5EB-NEXT: dsrl $2, $5, 16 +; MIPS64R5EB-NEXT: sll $2, $2, 0 +; MIPS64R5EB-NEXT: insert.w $w0[2], $2 +; MIPS64R5EB-NEXT: sll $1, $1, 0 +; MIPS64R5EB-NEXT: dsrl $2, $4, 48 +; MIPS64R5EB-NEXT: sll $2, $2, 0 +; MIPS64R5EB-NEXT: sll $3, $5, 0 +; MIPS64R5EB-NEXT: insert.w $w0[3], $3 +; MIPS64R5EB-NEXT: insert.w $w1[0], $2 +; MIPS64R5EB-NEXT: insert.w $w1[1], $1 +; MIPS64R5EB-NEXT: dsrl $1, $4, 16 +; 
MIPS64R5EB-NEXT: sll $1, $1, 0 +; MIPS64R5EB-NEXT: insert.w $w1[2], $1 +; MIPS64R5EB-NEXT: sll $1, $4, 0 +; MIPS64R5EB-NEXT: insert.w $w1[3], $1 +; MIPS64R5EB-NEXT: addv.w $w0, $w1, $w0 +; MIPS64R5EB-NEXT: copy_s.w $1, $w0[0] +; MIPS64R5EB-NEXT: copy_s.w $2, $w0[1] +; MIPS64R5EB-NEXT: copy_s.w $3, $w0[2] +; MIPS64R5EB-NEXT: copy_s.w $4, $w0[3] +; MIPS64R5EB-NEXT: andi $3, $3, 65535 +; MIPS64R5EB-NEXT: dinsm $4, $3, 16, 48 +; MIPS64R5EB-NEXT: andi $2, $2, 65535 +; MIPS64R5EB-NEXT: dsll $2, $2, 32 +; MIPS64R5EB-NEXT: or $2, $4, $2 +; MIPS64R5EB-NEXT: dsll $1, $1, 48 +; MIPS64R5EB-NEXT: or $2, $2, $1 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32R5EL-LABEL: i16_4: ; MIPS32R5EL: # %bb.0: @@ -1517,6 +1669,45 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) { ; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: i16_4: +; MIPS64R5EL: # %bb.0: +; MIPS64R5EL-NEXT: dsrl $1, $5, 16 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: sll $2, $5, 0 +; MIPS64R5EL-NEXT: insert.w $w0[0], $2 +; MIPS64R5EL-NEXT: insert.w $w0[1], $1 +; MIPS64R5EL-NEXT: dsrl $1, $4, 16 +; MIPS64R5EL-NEXT: dsrl $2, $5, 32 +; MIPS64R5EL-NEXT: sll $2, $2, 0 +; MIPS64R5EL-NEXT: insert.w $w0[2], $2 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: sll $2, $4, 0 +; MIPS64R5EL-NEXT: dsrl $3, $5, 48 +; MIPS64R5EL-NEXT: sll $3, $3, 0 +; MIPS64R5EL-NEXT: insert.w $w0[3], $3 +; MIPS64R5EL-NEXT: insert.w $w1[0], $2 +; MIPS64R5EL-NEXT: insert.w $w1[1], $1 +; MIPS64R5EL-NEXT: dsrl $1, $4, 32 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: insert.w $w1[2], $1 +; MIPS64R5EL-NEXT: dsrl $1, $4, 48 +; MIPS64R5EL-NEXT: sll $1, $1, 0 +; MIPS64R5EL-NEXT: insert.w $w1[3], $1 +; MIPS64R5EL-NEXT: addv.w $w0, $w1, $w0 +; MIPS64R5EL-NEXT: copy_s.w $1, $w0[3] +; MIPS64R5EL-NEXT: copy_s.w $2, $w0[2] +; MIPS64R5EL-NEXT: copy_s.w $3, $w0[1] +; MIPS64R5EL-NEXT: copy_s.w $4, $w0[0] +; MIPS64R5EL-NEXT: andi $3, $3, 65535 +; MIPS64R5EL-NEXT: 
dinsm $4, $3, 16, 48 +; MIPS64R5EL-NEXT: andi $2, $2, 65535 +; MIPS64R5EL-NEXT: dsll $2, $2, 32 +; MIPS64R5EL-NEXT: or $2, $4, $2 +; MIPS64R5EL-NEXT: dsll $1, $1, 48 +; MIPS64R5EL-NEXT: or $2, $2, $1 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop %1 = add <4 x i16> %a, %b ret <4 x i16> %1 } @@ -1749,8 +1940,6 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; ; MIPS64R5EB-LABEL: i32_2: ; MIPS64R5EB: # %bb.0: -; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 ; MIPS64R5EB-NEXT: dsrl $1, $5, 32 ; MIPS64R5EB-NEXT: insert.d $w0[0], $1 ; MIPS64R5EB-NEXT: insert.d $w0[1], $5 @@ -1758,12 +1947,12 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS64R5EB-NEXT: insert.d $w1[0], $1 ; MIPS64R5EB-NEXT: insert.d $w1[1], $4 ; MIPS64R5EB-NEXT: addv.d $w0, $w1, $w0 -; MIPS64R5EB-NEXT: copy_s.d $1, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $2, $w0[1] -; MIPS64R5EB-NEXT: sw $2, 12($sp) -; MIPS64R5EB-NEXT: sw $1, 8($sp) -; MIPS64R5EB-NEXT: ld $2, 8($sp) -; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EB-NEXT: shf.w $w0, $w0, 177 +; MIPS64R5EB-NEXT: copy_s.w $1, $w0[1] +; MIPS64R5EB-NEXT: copy_s.w $2, $w0[3] +; MIPS64R5EB-NEXT: dext $2, $2, 0, 32 +; MIPS64R5EB-NEXT: dsll $1, $1, 32 +; MIPS64R5EB-NEXT: or $2, $2, $1 ; MIPS64R5EB-NEXT: jr $ra ; MIPS64R5EB-NEXT: nop ; @@ -1797,23 +1986,18 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; ; MIPS64R5EL-LABEL: i32_2: ; MIPS64R5EL: # %bb.0: -; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EL-NEXT: sd $5, 16($sp) -; MIPS64R5EL-NEXT: sd $4, 24($sp) -; MIPS64R5EL-NEXT: lw $1, 20($sp) +; MIPS64R5EL-NEXT: dsrl $1, $5, 32 ; MIPS64R5EL-NEXT: insert.d $w0[0], $5 ; MIPS64R5EL-NEXT: insert.d $w0[1], $1 -; MIPS64R5EL-NEXT: lw $1, 28($sp) +; MIPS64R5EL-NEXT: dsrl $1, $4, 32 ; MIPS64R5EL-NEXT: insert.d $w1[0], $4 ; MIPS64R5EL-NEXT: insert.d $w1[1], $1 ; MIPS64R5EL-NEXT: addv.d $w0, $w1, $w0 -; MIPS64R5EL-NEXT: copy_s.d $1, $w0[0] -; 
MIPS64R5EL-NEXT: copy_s.d $2, $w0[1] -; MIPS64R5EL-NEXT: sw $2, 12($sp) -; MIPS64R5EL-NEXT: sw $1, 8($sp) -; MIPS64R5EL-NEXT: ld $2, 8($sp) -; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: copy_s.w $1, $w0[2] +; MIPS64R5EL-NEXT: copy_s.w $2, $w0[0] +; MIPS64R5EL-NEXT: dext $2, $2, 0, 32 +; MIPS64R5EL-NEXT: dsll $1, $1, 32 +; MIPS64R5EL-NEXT: or $2, $2, $1 ; MIPS64R5EL-NEXT: jr $ra ; MIPS64R5EL-NEXT: nop %1 = add <2 x i32> %a, %b @@ -3424,9 +3608,9 @@ define void @call_i8_4() { ; ; MIPS32R5EB-LABEL: call_i8_4: ; MIPS32R5EB: # %bb.0: # %entry -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: addiu $sp, $sp, -24 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5EB-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 ; MIPS32R5EB-NEXT: lui $1, 1543 ; MIPS32R5EB-NEXT: ori $4, $1, 2314 @@ -3436,17 +3620,17 @@ define void @call_i8_4() { ; MIPS32R5EB-NEXT: nop ; MIPS32R5EB-NEXT: lui $1, %hi(gv4i8) ; MIPS32R5EB-NEXT: sw $2, %lo(gv4i8)($1) -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 +; MIPS32R5EB-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 24 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; ; MIPS64R5EB-LABEL: call_i8_4: ; MIPS64R5EB: # %bb.0: # %entry -; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EB-NEXT: .cfi_offset 31, -8 ; MIPS64R5EB-NEXT: .cfi_offset 28, -16 ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_4))) @@ -3461,9 +3645,9 @@ define void @call_i8_4() { ; 
MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i8)($gp) ; MIPS64R5EB-NEXT: sw $2, 0($1) -; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EB-NEXT: jr $ra ; MIPS64R5EB-NEXT: nop ; @@ -3512,9 +3696,9 @@ define void @call_i8_4() { ; ; MIPS32R5EL-LABEL: call_i8_4: ; MIPS32R5EL: # %bb.0: # %entry -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: addiu $sp, $sp, -24 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5EL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: lui $1, 2569 ; MIPS32R5EL-NEXT: ori $4, $1, 1798 @@ -3523,17 +3707,17 @@ define void @call_i8_4() { ; MIPS32R5EL-NEXT: nop ; MIPS32R5EL-NEXT: lui $1, %hi(gv4i8) ; MIPS32R5EL-NEXT: sw $2, %lo(gv4i8)($1) -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 +; MIPS32R5EL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 24 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop ; ; MIPS64R5EL-LABEL: call_i8_4: ; MIPS64R5EL: # %bb.0: # %entry -; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EL-NEXT: .cfi_offset 31, -8 ; MIPS64R5EL-NEXT: .cfi_offset 28, -16 ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_4))) @@ -3547,9 +3731,9 @@ define void 
@call_i8_4() { ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i8)($gp) ; MIPS64R5EL-NEXT: sw $2, 0($1) -; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EL-NEXT: jr $ra ; MIPS64R5EL-NEXT: nop entry: @@ -3641,10 +3825,10 @@ define void @call_i8_8() { ; ; MIPS64R5EB-LABEL: call_i8_8: ; MIPS64R5EB: # %bb.0: # %entry -; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EB-NEXT: .cfi_offset 31, -8 ; MIPS64R5EB-NEXT: .cfi_offset 28, -16 ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_8))) @@ -3667,9 +3851,9 @@ define void @call_i8_8() { ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv8i8)($gp) ; MIPS64R5EB-NEXT: sd $2, 0($1) -; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EB-NEXT: jr $ra ; MIPS64R5EB-NEXT: nop ; @@ -3748,10 +3932,10 @@ define void @call_i8_8() { ; ; MIPS64R5EL-LABEL: call_i8_8: ; MIPS64R5EL: # %bb.0: # %entry -; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: daddiu 
$sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EL-NEXT: .cfi_offset 31, -8 ; MIPS64R5EL-NEXT: .cfi_offset 28, -16 ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_8))) @@ -3769,9 +3953,9 @@ define void @call_i8_8() { ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv8i8)($gp) ; MIPS64R5EL-NEXT: sd $2, 0($1) -; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EL-NEXT: jr $ra ; MIPS64R5EL-NEXT: nop entry: @@ -4059,9 +4243,9 @@ define void @calli16_2() { ; ; MIPS32R5EB-LABEL: calli16_2: ; MIPS32R5EB: # %bb.0: # %entry -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: addiu $sp, $sp, -24 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5EB-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 ; MIPS32R5EB-NEXT: lui $1, 6 ; MIPS32R5EB-NEXT: ori $4, $1, 7 @@ -4071,17 +4255,17 @@ define void @calli16_2() { ; MIPS32R5EB-NEXT: nop ; MIPS32R5EB-NEXT: lui $1, %hi(gv2i16) ; MIPS32R5EB-NEXT: sw $2, %lo(gv2i16)($1) -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 +; MIPS32R5EB-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 24 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; ; MIPS64R5EB-LABEL: calli16_2: ; MIPS64R5EB: # %bb.0: # %entry -; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; 
MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EB-NEXT: .cfi_offset 31, -8 ; MIPS64R5EB-NEXT: .cfi_offset 28, -16 ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_2))) @@ -4096,9 +4280,9 @@ define void @calli16_2() { ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2i16)($gp) ; MIPS64R5EB-NEXT: sw $2, 0($1) -; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EB-NEXT: jr $ra ; MIPS64R5EB-NEXT: nop ; @@ -4149,9 +4333,9 @@ define void @calli16_2() { ; ; MIPS32R5EL-LABEL: calli16_2: ; MIPS32R5EL: # %bb.0: # %entry -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: addiu $sp, $sp, -24 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5EL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: lui $1, 7 ; MIPS32R5EL-NEXT: ori $4, $1, 6 @@ -4161,17 +4345,17 @@ define void @calli16_2() { ; MIPS32R5EL-NEXT: nop ; MIPS32R5EL-NEXT: lui $1, %hi(gv2i16) ; MIPS32R5EL-NEXT: sw $2, %lo(gv2i16)($1) -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 +; MIPS32R5EL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 24 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop ; ; MIPS64R5EL-LABEL: calli16_2: ; MIPS64R5EL: # %bb.0: # %entry -; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte 
Folded Spill +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EL-NEXT: .cfi_offset 31, -8 ; MIPS64R5EL-NEXT: .cfi_offset 28, -16 ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_2))) @@ -4186,9 +4370,9 @@ define void @calli16_2() { ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2i16)($gp) ; MIPS64R5EL-NEXT: sw $2, 0($1) -; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EL-NEXT: jr $ra ; MIPS64R5EL-NEXT: nop entry: @@ -4282,10 +4466,10 @@ define void @calli16_4() { ; ; MIPS64R5EB-LABEL: calli16_4: ; MIPS64R5EB: # %bb.0: # %entry -; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EB-NEXT: .cfi_offset 31, -8 ; MIPS64R5EB-NEXT: .cfi_offset 28, -16 ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_4))) @@ -4308,9 +4492,9 @@ define void @calli16_4() { ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i16)($gp) ; MIPS64R5EB-NEXT: sd $2, 0($1) -; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 ; 
MIPS64R5EB-NEXT: jr $ra ; MIPS64R5EB-NEXT: nop ; @@ -4398,10 +4582,10 @@ define void @calli16_4() { ; ; MIPS64R5EL-LABEL: calli16_4: ; MIPS64R5EL: # %bb.0: # %entry -; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EL-NEXT: .cfi_offset 31, -8 ; MIPS64R5EL-NEXT: .cfi_offset 28, -16 ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_4))) @@ -4424,9 +4608,9 @@ define void @calli16_4() { ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i16)($gp) ; MIPS64R5EL-NEXT: sd $2, 0($1) -; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EL-NEXT: jr $ra ; MIPS64R5EL-NEXT: nop entry: @@ -4807,10 +4991,10 @@ define void @calli32_2() { ; ; MIPS64R5EB-LABEL: calli32_2: ; MIPS64R5EB: # %bb.0: # %entry -; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EB-NEXT: .cfi_offset 31, -8 ; MIPS64R5EB-NEXT: .cfi_offset 28, -16 ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_2))) @@ -4826,9 +5010,9 @@ define void @calli32_2() { ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2i32)($gp) ; 
MIPS64R5EB-NEXT: sd $2, 0($1) -; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EB-NEXT: jr $ra ; MIPS64R5EB-NEXT: nop ; @@ -4862,10 +5046,10 @@ define void @calli32_2() { ; ; MIPS64R5EL-LABEL: calli32_2: ; MIPS64R5EL: # %bb.0: # %entry -; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill -; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill ; MIPS64R5EL-NEXT: .cfi_offset 31, -8 ; MIPS64R5EL-NEXT: .cfi_offset 28, -16 ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_2))) @@ -4882,9 +5066,9 @@ define void @calli32_2() { ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2i32)($gp) ; MIPS64R5EL-NEXT: sd $2, 0($1) -; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload -; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64R5EL-NEXT: jr $ra ; MIPS64R5EL-NEXT: nop entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index 6cb98557c9bc1..97fc0634bfd53 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -281,7 +281,6 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: .LBB2_1: @ 
%vector.ph ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q5, #0x0 ; CHECK-NEXT: bic r3, r3, #3 @@ -316,20 +315,14 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: @ implicit-def: $q5 -; CHECK-NEXT: vmrs r4, p0 -; CHECK-NEXT: and r2, r4, #1 -; CHECK-NEXT: rsbs r5, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r5, #0, #1 -; CHECK-NEXT: ubfx r5, r4, #4, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: bfi r2, r5, #1, #1 -; CHECK-NEXT: ubfx r5, r4, #8, #1 -; CHECK-NEXT: ubfx r4, r4, #12, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: bfi r2, r5, #2, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #3, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: ubfx r4, r2, #4, #1 +; CHECK-NEXT: orr.w r4, r5, r4, lsl #1 +; CHECK-NEXT: ubfx r5, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: orr.w r4, r4, r5, lsl #2 +; CHECK-NEXT: orr.w r2, r4, r2, lsl #3 ; CHECK-NEXT: lsls r4, r2, #31 ; CHECK-NEXT: bne .LBB2_12 ; CHECK-NEXT: @ %bb.4: @ %else @@ -352,20 +345,14 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 ; CHECK-NEXT: @ implicit-def: $q6 -; CHECK-NEXT: vmrs r4, p0 -; CHECK-NEXT: and r2, r4, #1 -; CHECK-NEXT: rsbs r5, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r5, #0, #1 -; CHECK-NEXT: ubfx r5, r4, #4, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: bfi r2, r5, #1, #1 -; CHECK-NEXT: ubfx r5, r4, #8, #1 -; CHECK-NEXT: ubfx r4, r4, #12, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: bfi r2, r5, #2, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r2, r4, #3, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: ubfx r4, r2, #4, #1 +; 
CHECK-NEXT: orr.w r4, r5, r4, lsl #1 +; CHECK-NEXT: ubfx r5, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: orr.w r4, r4, r5, lsl #2 +; CHECK-NEXT: orr.w r2, r4, r2, lsl #3 ; CHECK-NEXT: lsls r4, r2, #31 ; CHECK-NEXT: bne .LBB2_15 ; CHECK-NEXT: @ %bb.9: @ %else15 @@ -432,7 +419,6 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vadd.f32 q0, q0, r0 -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index bcd92f81911b2..595bf8b3b294a 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -277,10 +277,10 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext ; CHECK-LABEL: test_width2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq .LBB5_3 -; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB5_1: @ %for.body.preheader ; CHECK-NEXT: adds r0, r2, #1 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r0, r0, #1 @@ -291,32 +291,24 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.64 r2 ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: subs r2, #2 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: ubfx r3, r3, #8, #1 -; CHECK-NEXT: rsb.w r12, r0, #0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r0, r12, #0, #1 ; CHECK-NEXT: sub.w r12, r1, #8 -; CHECK-NEXT: bfi r0, r3, #1, #1 -; CHECK-NEXT: lsls r3, r0, #31 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: subs r2, #2 +; CHECK-NEXT: ubfx r3, r0, #8, #1 +; 
CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: orr.w r3, r0, r3, lsl #1 +; CHECK-NEXT: lsls r0, r3, #31 ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne.w r3, [r12] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r0, r0, #30 +; CHECK-NEXT: ldrne.w r0, [r12] +; CHECK-NEXT: vmovne.32 q0[0], r0 +; CHECK-NEXT: lsls r0, r3, #30 ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrmi.w r0, [r12, #4] ; CHECK-NEXT: vmovmi.32 q0[2], r0 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: ubfx r3, r3, #8, #1 -; CHECK-NEXT: rsb.w r12, r0, #0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r0, r12, #0, #1 -; CHECK-NEXT: bfi r0, r3, #1, #1 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: ubfx r3, r0, #8, #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: orr.w r0, r0, r3, lsl #1 ; CHECK-NEXT: lsls r3, r0, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r3, s0 @@ -327,8 +319,7 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext ; CHECK-NEXT: strmi r0, [r1, #4] ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: le lr, .LBB5_2 -; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp9.not = icmp eq i8 %m, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll index 4934d22320903..00c72e47165f3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -91,72 +91,76 @@ define void @foo_sext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r4, r5, r7, lr} ; CHECK-LE-NEXT: push {r4, r5, r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r12, lr +; CHECK-LE-NEXT: vmov q1[2], q1[0], 
r12, lr ; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-LE-NEXT: csetm r3, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: bfi r1, r3, #0, #8 ; CHECK-LE-NEXT: csetm r3, lt -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: movs r4, #0 +; CHECK-LE-NEXT: bfi r1, r3, #8, #8 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1 ; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r3, [r2] -; CHECK-LE-NEXT: vmovne.32 q1[0], r3 +; CHECK-LE-NEXT: vmovne.32 q0[0], r3 ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 -; CHECK-LE-NEXT: vmov r2, s6 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 +; CHECK-LE-NEXT: vmov r1, s2 +; CHECK-LE-NEXT: vmov r2, s4 ; CHECK-LE-NEXT: vmov r3, s0 -; CHECK-LE-NEXT: vmov r4, s4 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r4, r2 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: asr.w r12, r2, #31 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-LE-NEXT: vmov r3, s2 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: asr.w lr, r4, #31 -; CHECK-LE-NEXT: vmov q1[3], q1[1], lr, r12 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-LE-NEXT: rsbs r5, r2, #0 +; CHECK-LE-NEXT: asr.w r12, r1, #31 +; CHECK-LE-NEXT: sbcs.w r1, r4, r2, asr #31 +; CHECK-LE-NEXT: vmov r2, s6 +; CHECK-LE-NEXT: asr.w lr, r3, #31 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: bfi r4, r1, #0, #8 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: bfi r4, r1, #8, #8 +; CHECK-LE-NEXT: and r2, 
r4, #1 +; CHECK-LE-NEXT: ubfx r1, r4, #8, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: vstrne d2, [r0] +; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi -; CHECK-LE-NEXT: vstrmi d3, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-BE-LABEL: foo_sext_v2i64_v2i32: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r12, lr, [r1] -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr -; CHECK-BE-NEXT: csetm lr, lt -; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: ldrd lr, r12, [r1] +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: @ implicit-def: $q2 -; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 -; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: csetm r3, lt -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: csetm lr, lt +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: bfi r3, lr, #0, #8 +; CHECK-BE-NEXT: csetm r1, lt +; CHECK-BE-NEXT: bfi r3, r1, #8, #8 +; CHECK-BE-NEXT: and r1, r3, #1 +; CHECK-BE-NEXT: ubfx r3, r3, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB5_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -175,31 +179,33 @@ define void @foo_sext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: .LBB5_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vmov r2, s3 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, s9 +; CHECK-BE-NEXT: 
movs r4, #0 +; CHECK-BE-NEXT: vmov r1, s3 ; CHECK-BE-NEXT: vmov r3, s1 -; CHECK-BE-NEXT: vmov r4, s11 -; CHECK-BE-NEXT: asr.w r12, r2, #31 +; CHECK-BE-NEXT: rsbs r5, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: vmov r2, s11 +; CHECK-BE-NEXT: asr.w r12, r1, #31 ; CHECK-BE-NEXT: asr.w lr, r3, #31 -; CHECK-BE-NEXT: rsbs r5, r4, #0 ; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12 -; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-BE-NEXT: vmov r3, s9 -; CHECK-BE-NEXT: csetm r2, lt +; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-BE-NEXT: csetm r1, lt ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: rsbs r5, r3, #0 -; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: bfi r4, r1, #0, #8 +; CHECK-BE-NEXT: csetm r1, lt +; CHECK-BE-NEXT: bfi r4, r1, #8, #8 +; CHECK-BE-NEXT: and r1, r4, #1 +; CHECK-BE-NEXT: ubfx r2, r4, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne d1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r4, r5, r7, pc} entry: %0 = load <2 x i32>, ptr %mask, align 4 @@ -215,8 +221,6 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r4, r5, r7, lr} ; CHECK-LE-NEXT: push {r4, r5, r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 @@ -226,9 +230,13 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: csetm r3, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r1, 
lr, asr #31 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: bfi r1, r3, #0, #8 ; CHECK-LE-NEXT: csetm r3, lt -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: movs r4, #0 +; CHECK-LE-NEXT: bfi r1, r3, #8, #8 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1 ; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r3, [r2] @@ -237,23 +245,25 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s2 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r3, s4 -; CHECK-LE-NEXT: vmov r4, s0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: asr.w r12, r2, #31 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: asr.w lr, r4, #31 +; CHECK-LE-NEXT: vmov r1, s2 +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-LE-NEXT: rsbs r5, r2, #0 +; CHECK-LE-NEXT: asr.w r12, r1, #31 +; CHECK-LE-NEXT: sbcs.w r1, r4, r2, asr #31 +; CHECK-LE-NEXT: vmov r2, s6 +; CHECK-LE-NEXT: asr.w lr, r3, #31 +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: bfi r4, r1, #0, #8 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: bfi r4, r1, #8, #8 +; CHECK-LE-NEXT: and r2, r4, #1 +; CHECK-LE-NEXT: ubfx r1, r4, #8, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, r3, d0 @@ -262,27 +272,27 @@ define void 
@foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, r2, d1 ; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-BE-LABEL: foo_sext_v2i64_v2i32_unaligned: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r12, lr, [r1] -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr -; CHECK-BE-NEXT: csetm lr, lt -; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: ldrd lr, r12, [r1] +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: @ implicit-def: $q2 -; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 -; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: csetm r3, lt -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: csetm lr, lt +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: bfi r3, lr, #0, #8 +; CHECK-BE-NEXT: csetm r1, lt +; CHECK-BE-NEXT: bfi r3, r1, #8, #8 +; CHECK-BE-NEXT: and r1, r3, #1 +; CHECK-BE-NEXT: ubfx r3, r3, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB6_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -301,24 +311,27 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: .LBB6_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vmov r2, s3 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmov r2, s9 +; CHECK-BE-NEXT: movs r4, #0 +; CHECK-BE-NEXT: vmov r1, s3 ; CHECK-BE-NEXT: vmov r3, s1 -; CHECK-BE-NEXT: vmov r4, s11 -; CHECK-BE-NEXT: asr.w r12, r2, #31 +; CHECK-BE-NEXT: rsbs 
r5, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: vmov r2, s11 +; CHECK-BE-NEXT: asr.w r12, r1, #31 ; CHECK-BE-NEXT: asr.w lr, r3, #31 -; CHECK-BE-NEXT: rsbs r5, r4, #0 ; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12 -; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-BE-NEXT: vmov r3, s9 -; CHECK-BE-NEXT: csetm r2, lt +; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-BE-NEXT: csetm r1, lt ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: rsbs r5, r3, #0 -; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: rsbs r3, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-BE-NEXT: bfi r4, r1, #0, #8 +; CHECK-BE-NEXT: csetm r1, lt +; CHECK-BE-NEXT: bfi r4, r1, #8, #8 +; CHECK-BE-NEXT: and r1, r4, #1 +; CHECK-BE-NEXT: ubfx r2, r4, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, r3, d0 @@ -327,7 +340,6 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, r2, d1 ; CHECK-BE-NEXT: strdne r2, r1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r4, r5, r7, pc} entry: %0 = load <2 x i32>, ptr %mask, align 4 @@ -343,8 +355,6 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r4, lr} ; CHECK-LE-NEXT: push {r4, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 @@ -355,9 +365,12 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: csetm r3, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: bfi r1, r3, #0, #8 ; 
CHECK-LE-NEXT: csetm r3, lt -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: bfi r1, r3, #8, #8 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1 ; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r3, [r2] @@ -375,36 +388,39 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs r4, r3, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: bfi r1, r2, #0, #8 ; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: bfi r1, r2, #8, #8 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r12, lr, [r1] -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r12, lr -; CHECK-BE-NEXT: csetm lr, lt -; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: ldrd lr, r12, [r1] +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 -; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: csetm r3, lt -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: vmov q1[3], q1[1], lr, r12 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: csetm lr, lt +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; 
CHECK-BE-NEXT: bfi r3, lr, #0, #8 +; CHECK-BE-NEXT: csetm r1, lt +; CHECK-BE-NEXT: bfi r3, r1, #8, #8 +; CHECK-BE-NEXT: and r1, r3, #1 +; CHECK-BE-NEXT: ubfx r3, r3, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB7_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -425,24 +441,26 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vrev64.32 q3, q1 ; CHECK-BE-NEXT: vrev64.32 q1, q2 -; CHECK-BE-NEXT: vmov r2, s7 +; CHECK-BE-NEXT: vmov r2, s5 ; CHECK-BE-NEXT: vand q0, q0, q3 ; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: vmov r3, s5 +; CHECK-BE-NEXT: vmov r3, s7 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31 ; CHECK-BE-NEXT: csetm r12, lt ; CHECK-BE-NEXT: rsbs r2, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-BE-NEXT: bfi r1, r12, #0, #1 +; CHECK-BE-NEXT: bfi r1, r12, #0, #8 ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: bfi r1, r2, #8, #8 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne d1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} entry: %0 = load <2 x i32>, ptr %mask, align 4 @@ -458,8 +476,6 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r4, lr} ; CHECK-LE-NEXT: push {r4, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 @@ -470,9 +486,12 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: csetm r3, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31 -; 
CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: bfi r1, r3, #0, #8 ; CHECK-LE-NEXT: csetm r3, lt -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: bfi r1, r3, #8, #8 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1 ; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r3, [r2] @@ -490,9 +509,12 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs r4, r3, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: bfi r1, r2, #0, #8 ; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: bfi r1, r2, #8, #8 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, r3, d0 @@ -501,27 +523,27 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, r2, d1 ; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: ldrd r12, lr, [r1] -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r12, lr -; CHECK-BE-NEXT: csetm lr, lt -; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: ldrd lr, r12, [r1] +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 -; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: csetm r3, lt -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: rsbs.w r1, lr, #0 +; CHECK-BE-NEXT: 
vmov q1[3], q1[1], lr, r12 +; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 +; CHECK-BE-NEXT: csetm lr, lt +; CHECK-BE-NEXT: rsbs.w r1, r12, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31 +; CHECK-BE-NEXT: bfi r3, lr, #0, #8 +; CHECK-BE-NEXT: csetm r1, lt +; CHECK-BE-NEXT: bfi r3, r1, #8, #8 +; CHECK-BE-NEXT: and r1, r3, #1 +; CHECK-BE-NEXT: ubfx r3, r3, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB8_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -542,17 +564,20 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vrev64.32 q3, q1 ; CHECK-BE-NEXT: vrev64.32 q1, q2 -; CHECK-BE-NEXT: vmov r2, s7 +; CHECK-BE-NEXT: vmov r2, s5 ; CHECK-BE-NEXT: vand q0, q0, q3 ; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: vmov r3, s5 +; CHECK-BE-NEXT: vmov r3, s7 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31 ; CHECK-BE-NEXT: csetm r12, lt ; CHECK-BE-NEXT: rsbs r2, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-BE-NEXT: bfi r1, r12, #0, #1 +; CHECK-BE-NEXT: bfi r1, r12, #0, #8 ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: bfi r1, r2, #8, #8 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, r3, d0 @@ -561,7 +586,6 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, r2, d1 ; CHECK-BE-NEXT: strdne r2, r1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} entry: %0 = load <2 x i32>, ptr %mask, align 4 @@ -724,27 +748,17 @@ entry: define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-LABEL: foo_v4f32_v4f16: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: 
sub sp, #4 ; CHECK-LE-NEXT: vldrh.s32 q0, [r1] ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs lr, p0 -; CHECK-LE-NEXT: and r1, lr, #1 -; CHECK-LE-NEXT: ubfx r3, lr, #4, #1 -; CHECK-LE-NEXT: rsb.w r12, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r12, #0, #1 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, lr, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, lr, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r12, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r12, r3, r12, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r3, r12, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #3 ; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: bne .LBB18_6 ; CHECK-LE-NEXT: @ %bb.1: @ %else @@ -760,24 +774,18 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: vldr.16 s2, [r2, #6] ; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: .LBB18_5: @ %else8 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1 ; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-LE-NEXT: and r3, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, 
lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -794,8 +802,7 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] -; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB18_6: @ %cond.load ; CHECK-LE-NEXT: vldr.16 s0, [r2] ; CHECK-LE-NEXT: lsls r3, r1, #30 @@ -815,27 +822,17 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) { ; ; CHECK-BE-LABEL: foo_v4f32_v4f16: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r7, lr} -; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vldrh.s32 q0, [r1] ; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: vmrs lr, p0 -; CHECK-BE-NEXT: ubfx r1, lr, #12, #1 -; CHECK-BE-NEXT: ubfx r3, lr, #8, #1 -; CHECK-BE-NEXT: rsb.w r12, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r12, #0, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, lr, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: and r3, lr, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r12, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r12, r3, r12, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r3, r12, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #3 ; CHECK-BE-NEXT: lsls r3, r1, #28 ; CHECK-BE-NEXT: bmi .LBB18_6 ; CHECK-BE-NEXT: @ %bb.1: @ %else @@ -851,24 +848,18 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) { ; 
CHECK-BE-NEXT: vldr.16 s2, [r2, #6] ; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: .LBB18_5: @ %else8 -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmrs r1, p0 ; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1 ; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, s0 @@ -885,8 +876,7 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, s3 ; CHECK-BE-NEXT: strne r1, [r0, #12] -; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: pop {r7, pc} +; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB18_6: @ %cond.load ; CHECK-BE-NEXT: vldr.16 s0, [r2] ; CHECK-BE-NEXT: lsls r3, r1, #29 @@ -915,27 +905,17 @@ entry: define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-LABEL: foo_v4f32_v4f16_unaligned: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vldrh.s32 q0, [r1] ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs lr, p0 -; CHECK-LE-NEXT: and r1, lr, #1 -; CHECK-LE-NEXT: 
ubfx r3, lr, #4, #1 -; CHECK-LE-NEXT: rsb.w r12, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r12, #0, #1 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, lr, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, lr, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r12, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r12, r3, r12, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r3, r12, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #3 ; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: bne .LBB19_6 ; CHECK-LE-NEXT: @ %bb.1: @ %else @@ -951,24 +931,18 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: vldr.16 s2, [r2, #6] ; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: .LBB19_5: @ %else8 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1 ; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-LE-NEXT: and r3, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls 
r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -985,8 +959,7 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] -; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB19_6: @ %cond.load ; CHECK-LE-NEXT: vldr.16 s0, [r2] ; CHECK-LE-NEXT: lsls r3, r1, #30 @@ -1006,27 +979,17 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) { ; ; CHECK-BE-LABEL: foo_v4f32_v4f16_unaligned: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r7, lr} -; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vldrh.s32 q0, [r1] ; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: vmrs lr, p0 -; CHECK-BE-NEXT: ubfx r1, lr, #12, #1 -; CHECK-BE-NEXT: ubfx r3, lr, #8, #1 -; CHECK-BE-NEXT: rsb.w r12, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r12, #0, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, lr, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: and r3, lr, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r12, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r12, r3, r12, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r3, r12, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #3 ; CHECK-BE-NEXT: lsls r3, r1, #28 ; CHECK-BE-NEXT: bmi .LBB19_6 ; CHECK-BE-NEXT: @ %bb.1: @ %else @@ -1042,24 +1005,18 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: vldr.16 s2, [r2, #6] ; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: .LBB19_5: @ %else8 -; CHECK-BE-NEXT: vmrs r2, p0 -; 
CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: vmrs r1, p0 ; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1 ; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, s0 @@ -1076,8 +1033,7 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, s3 ; CHECK-BE-NEXT: strne r1, [r0, #12] -; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: pop {r7, pc} +; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB19_6: @ %cond.load ; CHECK-BE-NEXT: vldr.16 s0, [r2] ; CHECK-BE-NEXT: lsls r3, r1, #29 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll index b0a3a6354daa7..9e3c63718e9a5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -45,24 +45,16 @@ entry: define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(ptr %dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: 
and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -79,30 +71,21 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(ptr %dest, <4 x i32> ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r0, [r0, #12] ; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx 
r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: ldrmi r2, [r0] @@ -120,7 +103,6 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(ptr %dest, <4 x i32> ; CHECK-BE-NEXT: ldrne r0, [r0, #12] ; CHECK-BE-NEXT: vmovne.32 q1[3], r0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -195,24 +177,16 @@ entry: define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(ptr %dest, <4 x i32> %a) { ; CHECK-LE-LABEL: zext16_masked_v4i32_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] @@ -230,30 +204,21 @@ define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(ptr %dest, <4 ; CHECK-LE-NEXT: ldrhmi r0, [r0, #6] ; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 ; CHECK-LE-NEXT: vmovlb.s16 q0, q0 -; CHECK-LE-NEXT: add sp, 
#4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: zext16_masked_v4i32_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: ldrhmi r2, [r0] @@ -272,7 +237,6 @@ define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(ptr %dest, <4 ; CHECK-BE-NEXT: vmovne.32 q0[3], r0 ; CHECK-BE-NEXT: vmovlb.s16 q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -353,24 +317,16 @@ entry: define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(ptr %dest, <4 x i32> %a) { ; CHECK-LE-LABEL: sext16_masked_v4i32_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, 
r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] @@ -388,30 +344,21 @@ define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(ptr %dest, <4 ; CHECK-LE-NEXT: ldrhmi r0, [r0, #6] ; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 ; CHECK-LE-NEXT: vmovlb.s16 q0, q0 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: sext16_masked_v4i32_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, 
r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: ldrhmi r2, [r0] @@ -430,7 +377,6 @@ define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(ptr %dest, <4 ; CHECK-BE-NEXT: vmovne.32 q0[3], r0 ; CHECK-BE-NEXT: vmovlb.s16 q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -553,38 +499,25 @@ entry: define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 ; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, 
r2, #31 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 @@ -616,44 +549,30 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16> ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] ; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 ; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #14, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-BE-NEXT: lsls r2, r1, #24 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: ldrhmi r2, [r0] ; CHECK-BE-NEXT: vmovmi.16 q1[0], r2 @@ -686,7 +605,6 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16> ; CHECK-BE-NEXT: ldrhne r0, [r0, #14] ; CHECK-BE-NEXT: vmovne.16 q1[7], r0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <8 x i16> %a, zeroinitializer @@ -1221,24 +1139,16 @@ entry: define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(ptr %dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -1255,30 +1165,21 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(ptr %dest, <4 x i3 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r0, [r0, #12] ; CHECK-LE-NEXT: vmovmi s3, r0 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; 
CHECK-BE-LABEL: masked_v4f32_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: ldrmi r2, [r0] @@ -1296,7 +1197,6 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(ptr %dest, <4 x i3 ; CHECK-BE-NEXT: ldrne r0, [r0, #12] ; CHECK-BE-NEXT: vmovne s7, r0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -1417,38 +1317,27 @@ entry: define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #36 -; CHECK-LE-NEXT: sub sp, #36 +; CHECK-LE-NEXT: .pad #32 +; CHECK-LE-NEXT: sub sp, #32 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: 
ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 ; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB45_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1472,7 +1361,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16 ; CHECK-LE-NEXT: lsls r1, r1, #24 ; CHECK-LE-NEXT: bmi .LBB45_16 ; CHECK-LE-NEXT: .LBB45_8: @ %else20 -; CHECK-LE-NEXT: add sp, #36 +; CHECK-LE-NEXT: add sp, #32 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB45_9: @ %cond.load ; CHECK-LE-NEXT: ldrh r2, [r0] @@ -1530,44 +1419,33 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16 ; CHECK-LE-NEXT: strh.w r0, [sp] ; CHECK-LE-NEXT: vldr.16 s4, [sp] ; CHECK-LE-NEXT: vins.f16 s3, s4 -; CHECK-LE-NEXT: add sp, #36 +; CHECK-LE-NEXT: add sp, #32 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: 
masked_v8f16_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #36 -; CHECK-BE-NEXT: sub sp, #36 +; CHECK-BE-NEXT: .pad #32 +; CHECK-BE-NEXT: sub sp, #32 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 ; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #14, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-BE-NEXT: lsls r2, r1, #24 ; CHECK-BE-NEXT: bmi .LBB45_10 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #25 @@ -1597,7 +1475,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16 ; CHECK-BE-NEXT: vins.f16 s7, s0 ; CHECK-BE-NEXT: .LBB45_9: @ %else20 ; CHECK-BE-NEXT: vrev64.16 q0, q1 -; 
CHECK-BE-NEXT: add sp, #36 +; CHECK-BE-NEXT: add sp, #32 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB45_10: @ %cond.load ; CHECK-BE-NEXT: ldrh r2, [r0] @@ -1732,19 +1610,20 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64> ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r7, lr} ; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: vmov r1, lr, d0 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r3, r12, d1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, lr +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: bfi r2, r1, #0, #8 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: bfi r2, r1, #8, #8 +; CHECK-LE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB49_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load @@ -1757,7 +1636,6 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64> ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: @@ -1769,20 +1647,21 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64> ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: movs r1, #0 
-; CHECK-BE-NEXT: vmov r2, r3, d3 -; CHECK-BE-NEXT: vmov r12, lr, d2 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: sbcs.w r2, r1, r2 +; CHECK-BE-NEXT: vmov lr, r2, d2 +; CHECK-BE-NEXT: vmov r12, r3, d3 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, lr ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: bfi r1, r2, #0, #8 ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: bfi r1, r2, #8, #8 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB49_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -1795,7 +1674,6 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64> ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne d1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} ; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.4: @@ -1813,19 +1691,20 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r7, lr} ; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d2 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d3 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: vmov r1, lr, d2 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r3, r12, d3 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, lr +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; 
CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: bfi r2, r1, #0, #8 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: bfi r2, r1, #8, #8 +; CHECK-LE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB50_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load @@ -1838,7 +1717,6 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: @@ -1850,20 +1728,21 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vmov r2, r3, d1 -; CHECK-BE-NEXT: vmov r12, lr, d0 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: sbcs.w r2, r1, r2 +; CHECK-BE-NEXT: vmov lr, r2, d0 +; CHECK-BE-NEXT: vmov r12, r3, d1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, lr ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: bfi r1, r2, #0, #8 ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: bfi r1, r2, #8, #8 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB50_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -1876,7 +1755,6 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne d1, [r0, #8] -; 
CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} ; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.4: @@ -1912,30 +1790,22 @@ entry: define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(ptr %dest, <4 x i32> %a) { ; CHECK-LE-LABEL: anyext_v4i16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: vmrs r3, p0 -; CHECK-LE-NEXT: and r1, r3, #1 -; CHECK-LE-NEXT: rsbs r2, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: ubfx r2, r3, #4, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: ubfx r2, r3, #8, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: ubfx r2, r3, #12, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB52_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load -; CHECK-LE-NEXT: ldrh r2, [r0] -; CHECK-LE-NEXT: vdup.32 q0, r12 -; CHECK-LE-NEXT: vmov.32 q0[0], r2 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: ldrh r3, [r0] +; CHECK-LE-NEXT: vdup.32 q0, r2 +; CHECK-LE-NEXT: vmov.32 q0[0], r3 ; CHECK-LE-NEXT: b .LBB52_3 ; CHECK-LE-NEXT: .LBB52_2: ; CHECK-LE-NEXT: vmov.i32 q0, #0x0 @@ -1952,36 +1822,27 @@ define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(ptr %dest, <4 x i32> %a) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrhmi r0, [r0, #6] ; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: anyext_v4i16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; 
CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov.w r12, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vmrs r3, p0 -; CHECK-BE-NEXT: ubfx r1, r3, #12, #1 -; CHECK-BE-NEXT: rsbs r2, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: ubfx r2, r3, #8, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 -; CHECK-BE-NEXT: ubfx r2, r3, #4, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: and r2, r3, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bpl .LBB52_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: ldrh r2, [r0] -; CHECK-BE-NEXT: vdup.32 q1, r12 -; CHECK-BE-NEXT: vmov.32 q1[0], r2 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: ldrh r3, [r0] +; CHECK-BE-NEXT: vdup.32 q1, r2 +; CHECK-BE-NEXT: vmov.32 q1[0], r3 ; CHECK-BE-NEXT: b .LBB52_3 ; CHECK-BE-NEXT: .LBB52_2: ; CHECK-BE-NEXT: vmov.i32 q1, #0x0 @@ -1999,7 +1860,6 @@ define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(ptr %dest, <4 x i32> %a) { ; CHECK-BE-NEXT: ldrhne r0, [r0, #6] ; CHECK-BE-NEXT: vmovne.32 q1[3], r0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll index 9012fada2bee2..73e7827b7046c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -24,23 +24,15 @@ entry: define arm_aapcs_vfpcc void @masked_v4i32_align1(ptr 
%dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -57,29 +49,20 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(ptr %dest, <4 x i32> %a) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, 
#1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, s4 @@ -96,7 +79,6 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(ptr %dest, <4 x i32> %a) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, s7 ; CHECK-BE-NEXT: strne r1, [r0, #12] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -181,37 +163,24 @@ entry: define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 ; CHECK-LE-NEXT: 
ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] ; CHECK-LE-NEXT: strhne r2, [r0] @@ -243,43 +212,29 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] ; CHECK-LE-NEXT: strhmi r1, [r0, #14] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #14, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 
-; CHECK-BE-NEXT: bfi r2, r3, #6, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-BE-NEXT: lsls r2, r1, #24 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u16 r2, q1[0] ; CHECK-BE-NEXT: strhmi r2, [r0] @@ -311,7 +266,6 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne.u16 r1, q1[7] ; CHECK-BE-NEXT: strhne r1, [r0, #14] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <8 x i16> %a, zeroinitializer @@ -471,23 +425,15 @@ entry: define arm_aapcs_vfpcc void @masked_v4f32_align1(ptr %dest, <4 x float> %a, <4 x i32> %b) { ; CHECK-LE-LABEL: masked_v4f32_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r3, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -504,30 +450,21 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(ptr %dest, <4 x float> %a, <4 x ; CHECK-LE-NEXT: itt mi ; 
CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, s4 @@ -544,7 +481,6 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(ptr %dest, <4 x float> %a, <4 x ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, s7 ; CHECK-BE-NEXT: strne r1, [r0, #12] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp ugt <4 x i32> %b, zeroinitializer @@ -630,37 +566,26 @@ entry: define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x i16> %b) { ; CHECK-LE-LABEL: masked_v8f16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #36 -; CHECK-LE-NEXT: sub sp, #36 +; CHECK-LE-NEXT: .pad #32 +; CHECK-LE-NEXT: sub sp, #32 ; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr -; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: and r3, r1, 
#1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 ; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB16_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -684,7 +609,7 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x ; CHECK-LE-NEXT: lsls r1, r1, #24 ; CHECK-LE-NEXT: bmi .LBB16_16 ; CHECK-LE-NEXT: .LBB16_8: @ %else14 -; CHECK-LE-NEXT: add sp, #36 +; CHECK-LE-NEXT: add sp, #32 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB16_9: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s0, [sp, #28] @@ -736,44 +661,33 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x ; CHECK-LE-NEXT: vstr.16 s0, [sp] ; CHECK-LE-NEXT: ldrh.w r1, [sp] ; CHECK-LE-NEXT: strh r1, [r0, #14] -; CHECK-LE-NEXT: add sp, #36 +; CHECK-LE-NEXT: add sp, 
#32 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #36 -; CHECK-BE-NEXT: sub sp, #36 +; CHECK-BE-NEXT: .pad #32 +; CHECK-BE-NEXT: sub sp, #32 ; CHECK-BE-NEXT: vrev64.16 q2, q1 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #14, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 ; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 ; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3 ; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4 ; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5 ; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7 +; CHECK-BE-NEXT: lsls r2, r1, #24 ; CHECK-BE-NEXT: bmi .LBB16_9 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #25 @@ -797,7 +711,7 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: bne .LBB16_16 ; CHECK-BE-NEXT: 
.LBB16_8: @ %else14 -; CHECK-BE-NEXT: add sp, #36 +; CHECK-BE-NEXT: add sp, #32 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB16_9: @ %cond.store ; CHECK-BE-NEXT: vstr.16 s4, [sp, #28] @@ -849,7 +763,7 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x ; CHECK-BE-NEXT: vstr.16 s0, [sp] ; CHECK-BE-NEXT: ldrh.w r1, [sp] ; CHECK-BE-NEXT: strh r1, [r0, #14] -; CHECK-BE-NEXT: add sp, #36 +; CHECK-BE-NEXT: add sp, #32 ; CHECK-BE-NEXT: bx lr entry: %c = icmp ugt <8 x i16> %b, zeroinitializer @@ -917,53 +831,53 @@ define arm_aapcs_vfpcc void @masked_v2i64(ptr %dest, <2 x i64> %a) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r7, lr} ; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: vmov r1, lr, d0 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r3, r12, d1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, lr +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: bfi r2, r1, #0, #8 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: bfi r2, r1, #8, #8 +; CHECK-LE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: masked_v2i64: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; CHECK-BE-NEXT: push {r7, lr} -; 
CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vmov r2, r3, d3 -; CHECK-BE-NEXT: vmov r12, lr, d2 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: sbcs.w r2, r1, r2 +; CHECK-BE-NEXT: vmov lr, r2, d2 +; CHECK-BE-NEXT: vmov r12, r3, d3 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, lr ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: bfi r1, r2, #0, #8 ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: bfi r1, r2, #8, #8 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne d1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} entry: %c = icmp sgt <2 x i64> %a, zeroinitializer @@ -976,53 +890,53 @@ define arm_aapcs_vfpcc void @masked_v2f64(ptr %dest, <2 x double> %a, <2 x i64> ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .save {r7, lr} ; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d2 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d3 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: vmov r1, lr, d2 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r3, r12, d3 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, lr +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: 
sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: bfi r2, r1, #0, #8 +; CHECK-LE-NEXT: csetm r1, lt +; CHECK-LE-NEXT: bfi r2, r1, #8, #8 +; CHECK-LE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-LE-NEXT: and r2, r2, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: masked_v2f64: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r7, lr} ; CHECK-BE-NEXT: push {r7, lr} -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vmov r2, r3, d5 -; CHECK-BE-NEXT: vmov r12, lr, d4 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: sbcs.w r2, r1, r2 +; CHECK-BE-NEXT: vmov lr, r2, d4 +; CHECK-BE-NEXT: vmov r12, r3, d5 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r1, lr ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: rsbs.w r3, lr, #0 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: bfi r1, r2, #0, #8 ; CHECK-BE-NEXT: csetm r2, lt -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: bfi r1, r2, #8, #8 +; CHECK-BE-NEXT: and r2, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi d0, [r0] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne d1, [r0, #8] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} entry: %c = icmp sgt <2 x i64> %b, zeroinitializer @@ -1093,23 +1007,15 @@ entry: define arm_aapcs_vfpcc void @masked_v4i16_align1(ptr %dest, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; 
CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -1126,29 +1032,20 @@ define arm_aapcs_vfpcc void @masked_v4i16_align1(ptr %dest, <4 x i32> %a) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strhmi r1, [r0, #6] -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r1, 
#4, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r2, s4 @@ -1165,7 +1062,6 @@ define arm_aapcs_vfpcc void @masked_v4i16_align1(ptr %dest, <4 x i32> %a) { ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, s7 ; CHECK-BE-NEXT: strhne r1, [r0, #6] -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <4 x i32> %a, zeroinitializer @@ -1177,120 +1073,122 @@ entry: define arm_aapcs_vfpcc void @masked_v4f16_align4(ptr %dest, <4 x float> %a) { ; CHECK-LE-LABEL: masked_v4f16_align4: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r2, r1, #0, #4 +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r2, r1, #4, #4 +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: csetm r2, gt -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: bfi r2, r1, #8, #4 +; CHECK-LE-NEXT: csetm r1, gt +; CHECK-LE-NEXT: bfi r2, r1, #12, #4 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: ubfx r1, r2, #4, #1 +; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: 
orr.w r1, r1, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 -; CHECK-LE-NEXT: bne .LBB25_5 +; CHECK-LE-NEXT: bne .LBB25_6 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 -; CHECK-LE-NEXT: bmi .LBB25_6 +; CHECK-LE-NEXT: bmi .LBB25_7 ; CHECK-LE-NEXT: .LBB25_2: @ %else2 ; CHECK-LE-NEXT: lsls r2, r1, #29 -; CHECK-LE-NEXT: bmi .LBB25_7 -; CHECK-LE-NEXT: .LBB25_3: @ %else4 +; CHECK-LE-NEXT: bpl .LBB25_4 +; CHECK-LE-NEXT: .LBB25_3: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s5, [r0, #4] +; CHECK-LE-NEXT: .LBB25_4: @ %else4 ; CHECK-LE-NEXT: lsls r1, r1, #28 -; CHECK-LE-NEXT: bmi .LBB25_8 -; CHECK-LE-NEXT: .LBB25_4: @ %else6 -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: it pl +; CHECK-LE-NEXT: bxpl lr +; CHECK-LE-NEXT: .LBB25_5: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s0, s5 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] ; CHECK-LE-NEXT: bx lr -; CHECK-LE-NEXT: .LBB25_5: @ %cond.store +; CHECK-LE-NEXT: .LBB25_6: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s4, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: bpl .LBB25_2 -; CHECK-LE-NEXT: .LBB25_6: @ %cond.store1 +; CHECK-LE-NEXT: .LBB25_7: @ %cond.store1 ; CHECK-LE-NEXT: vmovx.f16 s0, s4 ; CHECK-LE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-LE-NEXT: lsls r2, r1, #29 -; CHECK-LE-NEXT: bpl .LBB25_3 -; CHECK-LE-NEXT: .LBB25_7: @ %cond.store3 -; CHECK-LE-NEXT: vstr.16 s5, [r0, #4] -; CHECK-LE-NEXT: lsls r1, r1, #28 -; CHECK-LE-NEXT: bpl .LBB25_4 -; CHECK-LE-NEXT: .LBB25_8: @ %cond.store5 -; CHECK-LE-NEXT: vmovx.f16 s0, s5 -; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] -; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: bmi .LBB25_3 +; CHECK-LE-NEXT: b .LBB25_4 ; ; CHECK-BE-LABEL: masked_v4f16_align4: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vcmp.f32 s4, #0 ; CHECK-BE-NEXT: 
vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: vcmp.f32 s5, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s5, #0 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: bfi r2, r1, #0, #4 +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: bfi r2, r1, #4, #4 +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: csetm r2, gt -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: bfi r2, r1, #8, #4 +; CHECK-BE-NEXT: csetm r1, gt +; CHECK-BE-NEXT: bfi r2, r1, #12, #4 +; CHECK-BE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: and r2, r2, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 -; CHECK-BE-NEXT: bmi .LBB25_5 +; CHECK-BE-NEXT: bmi .LBB25_6 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #29 -; CHECK-BE-NEXT: bmi .LBB25_6 +; CHECK-BE-NEXT: bmi .LBB25_7 ; CHECK-BE-NEXT: .LBB25_2: @ %else2 ; CHECK-BE-NEXT: lsls r2, r1, #30 -; CHECK-BE-NEXT: bmi .LBB25_7 -; CHECK-BE-NEXT: .LBB25_3: @ %else4 +; CHECK-BE-NEXT: bpl .LBB25_4 +; CHECK-BE-NEXT: .LBB25_3: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-BE-NEXT: .LBB25_4: @ %else4 ; CHECK-BE-NEXT: lsls r1, r1, #31 -; CHECK-BE-NEXT: bne .LBB25_8 -; CHECK-BE-NEXT: .LBB25_4: @ %else6 -; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: it eq +; 
CHECK-BE-NEXT: bxeq lr +; CHECK-BE-NEXT: .LBB25_5: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s1 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] ; CHECK-BE-NEXT: bx lr -; CHECK-BE-NEXT: .LBB25_5: @ %cond.store +; CHECK-BE-NEXT: .LBB25_6: @ %cond.store ; CHECK-BE-NEXT: vstr.16 s0, [r0] ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB25_2 -; CHECK-BE-NEXT: .LBB25_6: @ %cond.store1 +; CHECK-BE-NEXT: .LBB25_7: @ %cond.store1 ; CHECK-BE-NEXT: vmovx.f16 s0, s0 ; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 -; CHECK-BE-NEXT: bpl .LBB25_3 -; CHECK-BE-NEXT: .LBB25_7: @ %cond.store3 -; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] -; CHECK-BE-NEXT: lsls r1, r1, #31 -; CHECK-BE-NEXT: beq .LBB25_4 -; CHECK-BE-NEXT: .LBB25_8: @ %cond.store5 -; CHECK-BE-NEXT: vmovx.f16 s0, s1 -; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] -; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: bmi .LBB25_3 +; CHECK-BE-NEXT: b .LBB25_4 entry: %c = fcmp ogt <4 x float> %a, zeroinitializer %trunc = fptrunc <4 x float> %a to <4 x half> @@ -1301,120 +1199,122 @@ entry: define arm_aapcs_vfpcc void @masked_v4f16_align2(ptr %dest, <4 x float> %a) { ; CHECK-LE-LABEL: masked_v4f16_align2: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r2, r1, #0, #4 +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: csetm 
r2, gt +; CHECK-LE-NEXT: bfi r2, r1, #4, #4 +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: csetm r2, gt -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: bfi r2, r1, #8, #4 +; CHECK-LE-NEXT: csetm r1, gt +; CHECK-LE-NEXT: bfi r2, r1, #12, #4 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: ubfx r1, r2, #4, #1 +; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 -; CHECK-LE-NEXT: bne .LBB26_5 +; CHECK-LE-NEXT: bne .LBB26_6 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 -; CHECK-LE-NEXT: bmi .LBB26_6 +; CHECK-LE-NEXT: bmi .LBB26_7 ; CHECK-LE-NEXT: .LBB26_2: @ %else2 ; CHECK-LE-NEXT: lsls r2, r1, #29 -; CHECK-LE-NEXT: bmi .LBB26_7 -; CHECK-LE-NEXT: .LBB26_3: @ %else4 +; CHECK-LE-NEXT: bpl .LBB26_4 +; CHECK-LE-NEXT: .LBB26_3: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s5, [r0, #4] +; CHECK-LE-NEXT: .LBB26_4: @ %else4 ; CHECK-LE-NEXT: lsls r1, r1, #28 -; CHECK-LE-NEXT: bmi .LBB26_8 -; CHECK-LE-NEXT: .LBB26_4: @ %else6 -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: it pl +; CHECK-LE-NEXT: bxpl lr +; CHECK-LE-NEXT: .LBB26_5: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s0, s5 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] ; CHECK-LE-NEXT: bx lr -; CHECK-LE-NEXT: .LBB26_5: @ %cond.store +; CHECK-LE-NEXT: .LBB26_6: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s4, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: bpl .LBB26_2 -; CHECK-LE-NEXT: .LBB26_6: @ %cond.store1 +; CHECK-LE-NEXT: .LBB26_7: @ %cond.store1 ; CHECK-LE-NEXT: vmovx.f16 s0, s4 ; CHECK-LE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-LE-NEXT: lsls r2, r1, #29 -; CHECK-LE-NEXT: bpl .LBB26_3 -; CHECK-LE-NEXT: .LBB26_7: @ %cond.store3 -; CHECK-LE-NEXT: vstr.16 s5, [r0, #4] -; CHECK-LE-NEXT: lsls r1, r1, #28 -; CHECK-LE-NEXT: bpl .LBB26_4 -; 
CHECK-LE-NEXT: .LBB26_8: @ %cond.store5 -; CHECK-LE-NEXT: vmovx.f16 s0, s5 -; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] -; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: bmi .LBB26_3 +; CHECK-LE-NEXT: b .LBB26_4 ; ; CHECK-BE-LABEL: masked_v4f16_align2: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vcmp.f32 s4, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: vcmp.f32 s5, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s5, #0 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: bfi r2, r1, #0, #4 +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: bfi r2, r1, #4, #4 +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: csetm r2, gt -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: bfi r2, r1, #8, #4 +; CHECK-BE-NEXT: csetm r1, gt +; CHECK-BE-NEXT: bfi r2, r1, #12, #4 +; CHECK-BE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: and r2, r2, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 -; CHECK-BE-NEXT: bmi .LBB26_5 +; CHECK-BE-NEXT: bmi .LBB26_6 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: 
lsls r2, r1, #29 -; CHECK-BE-NEXT: bmi .LBB26_6 +; CHECK-BE-NEXT: bmi .LBB26_7 ; CHECK-BE-NEXT: .LBB26_2: @ %else2 ; CHECK-BE-NEXT: lsls r2, r1, #30 -; CHECK-BE-NEXT: bmi .LBB26_7 -; CHECK-BE-NEXT: .LBB26_3: @ %else4 +; CHECK-BE-NEXT: bpl .LBB26_4 +; CHECK-BE-NEXT: .LBB26_3: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-BE-NEXT: .LBB26_4: @ %else4 ; CHECK-BE-NEXT: lsls r1, r1, #31 -; CHECK-BE-NEXT: bne .LBB26_8 -; CHECK-BE-NEXT: .LBB26_4: @ %else6 -; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: it eq +; CHECK-BE-NEXT: bxeq lr +; CHECK-BE-NEXT: .LBB26_5: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s1 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] ; CHECK-BE-NEXT: bx lr -; CHECK-BE-NEXT: .LBB26_5: @ %cond.store +; CHECK-BE-NEXT: .LBB26_6: @ %cond.store ; CHECK-BE-NEXT: vstr.16 s0, [r0] ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB26_2 -; CHECK-BE-NEXT: .LBB26_6: @ %cond.store1 +; CHECK-BE-NEXT: .LBB26_7: @ %cond.store1 ; CHECK-BE-NEXT: vmovx.f16 s0, s0 ; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 -; CHECK-BE-NEXT: bpl .LBB26_3 -; CHECK-BE-NEXT: .LBB26_7: @ %cond.store3 -; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] -; CHECK-BE-NEXT: lsls r1, r1, #31 -; CHECK-BE-NEXT: beq .LBB26_4 -; CHECK-BE-NEXT: .LBB26_8: @ %cond.store5 -; CHECK-BE-NEXT: vmovx.f16 s0, s1 -; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] -; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: bmi .LBB26_3 +; CHECK-BE-NEXT: b .LBB26_4 entry: %c = fcmp ogt <4 x float> %a, zeroinitializer %trunc = fptrunc <4 x float> %a to <4 x half> @@ -1425,29 +1325,36 @@ entry: define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) { ; CHECK-LE-LABEL: masked_v4f16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #20 -; CHECK-LE-NEXT: sub sp, #20 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-LE-NEXT: vcmp.f32 s1, #0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r2, r1, #0, #4 +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r2, r1, #4, #4 +; CHECK-LE-NEXT: csetm r1, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: csetm r2, gt -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: bfi r2, r1, #8, #4 +; CHECK-LE-NEXT: csetm r1, gt +; CHECK-LE-NEXT: bfi r2, r1, #12, #4 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: ubfx r1, r2, #4, #1 +; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #2 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB27_5 ; CHECK-LE-NEXT: @ %bb.1: @ %else @@ -1460,7 +1367,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) { ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bmi .LBB27_8 ; CHECK-LE-NEXT: .LBB27_4: @ %else6 -; CHECK-LE-NEXT: add sp, #20 +; CHECK-LE-NEXT: add sp, #16 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB27_5: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s4, [sp, #12] @@ -1486,35 +1393,42 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) { ; CHECK-LE-NEXT: vstr.16 s0, [sp] ; CHECK-LE-NEXT: ldrh.w r1, [sp] ; CHECK-LE-NEXT: strh r1, [r0, #6] -; CHECK-LE-NEXT: add sp, #20 +; CHECK-LE-NEXT: add sp, #16 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f16_align1: ; CHECK-BE: @ %bb.0: @ 
%entry -; CHECK-BE-NEXT: .pad #20 -; CHECK-BE-NEXT: sub sp, #20 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vcmp.f32 s4, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: vcmp.f32 s5, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s5, #0 -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: vcmp.f32 s6, #0 +; CHECK-BE-NEXT: bfi r2, r1, #0, #4 +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 -; CHECK-BE-NEXT: csetm r2, gt +; CHECK-BE-NEXT: vcmp.f32 s7, #0 +; CHECK-BE-NEXT: bfi r2, r1, #4, #4 +; CHECK-BE-NEXT: csetm r1, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: csetm r2, gt -; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: bfi r2, r1, #8, #4 +; CHECK-BE-NEXT: csetm r1, gt +; CHECK-BE-NEXT: bfi r2, r1, #12, #4 +; CHECK-BE-NEXT: ubfx r1, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: and r2, r2, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r3, lsl #2 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB27_5 ; CHECK-BE-NEXT: @ %bb.1: @ %else @@ -1527,7 +1441,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) { ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: bne .LBB27_8 ; CHECK-BE-NEXT: .LBB27_4: @ %else6 -; CHECK-BE-NEXT: add sp, #20 +; CHECK-BE-NEXT: add sp, #16 ; CHECK-BE-NEXT: bx lr ; 
CHECK-BE-NEXT: .LBB27_5: @ %cond.store ; CHECK-BE-NEXT: vstr.16 s0, [sp, #12] @@ -1553,7 +1467,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) { ; CHECK-BE-NEXT: vstr.16 s0, [sp] ; CHECK-BE-NEXT: ldrh.w r1, [sp] ; CHECK-BE-NEXT: strh r1, [r0, #6] -; CHECK-BE-NEXT: add sp, #20 +; CHECK-BE-NEXT: add sp, #16 ; CHECK-BE-NEXT: bx lr entry: %c = fcmp ogt <4 x float> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll index 0d0e45956080d..6aea38cde4e69 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -5,47 +5,44 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) { ; CHECK-LE-LABEL: bitcast_to_v4i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: and r0, r0, #15 -; CHECK-LE-NEXT: vmov.i8 q1, #0x0 -; CHECK-LE-NEXT: vmov.i8 q2, #0xff -; CHECK-LE-NEXT: vmsr p0, r0 -; CHECK-LE-NEXT: vpsel q1, q2, q1 -; CHECK-LE-NEXT: vmov.u8 r0, q1[2] -; CHECK-LE-NEXT: vmov.u8 r1, q1[0] -; CHECK-LE-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-LE-NEXT: vmov.u8 r0, q1[3] -; CHECK-LE-NEXT: vmov.u8 r1, q1[1] +; CHECK-LE-NEXT: and r3, r0, #1 +; CHECK-LE-NEXT: and r1, r0, #8 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sub.w r1, r2, r1, lsr #3 +; CHECK-LE-NEXT: bfi r2, r3, #0, #4 +; CHECK-LE-NEXT: ubfx r3, r0, #1, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsbs r0, r0, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #4 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-LE-NEXT: bfi r2, r0, #8, #4 +; CHECK-LE-NEXT: bfi r2, r1, #12, #4 +; CHECK-LE-NEXT: vmsr p0, r2 ; CHECK-LE-NEXT: vpsel q0, q0, q1 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_to_v4i1: ; CHECK-BE: @ %bb.0: @ %entry 
-; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: rbit r0, r0 -; CHECK-BE-NEXT: vmov.i8 q1, #0x0 -; CHECK-BE-NEXT: vmov.i8 q2, #0xff -; CHECK-BE-NEXT: lsrs r0, r0, #28 -; CHECK-BE-NEXT: vmsr p0, r0 -; CHECK-BE-NEXT: vpsel q1, q2, q1 -; CHECK-BE-NEXT: vmov.u8 r0, q1[2] -; CHECK-BE-NEXT: vmov.u8 r1, q1[0] -; CHECK-BE-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-BE-NEXT: vmov.u8 r0, q1[3] -; CHECK-BE-NEXT: vmov.u8 r1, q1[1] +; CHECK-BE-NEXT: and r1, r0, #8 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: sub.w r1, r2, r1, lsr #3 +; CHECK-BE-NEXT: bfi r2, r1, #0, #4 +; CHECK-BE-NEXT: ubfx r1, r0, #2, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #4, #4 +; CHECK-BE-NEXT: ubfx r1, r0, #1, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #8, #4 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: bfi r2, r0, #12, #4 +; CHECK-BE-NEXT: vmsr p0, r2 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = bitcast i4 %b to <4 x i1> @@ -56,68 +53,68 @@ entry: define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) { ; CHECK-LE-LABEL: bitcast_to_v8i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: uxtb r0, r0 -; CHECK-LE-NEXT: vmov.i8 q1, #0x0 -; CHECK-LE-NEXT: vmov.i8 q2, #0xff -; CHECK-LE-NEXT: vmsr p0, r0 -; CHECK-LE-NEXT: vpsel q2, q2, q1 -; CHECK-LE-NEXT: vmov.u8 r0, q2[0] -; CHECK-LE-NEXT: vmov.16 q1[0], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[1] -; CHECK-LE-NEXT: vmov.16 q1[1], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[2] -; CHECK-LE-NEXT: vmov.16 q1[2], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[3] -; CHECK-LE-NEXT: vmov.16 q1[3], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[4] -; CHECK-LE-NEXT: vmov.16 q1[4], r0 -; 
CHECK-LE-NEXT: vmov.u8 r0, q2[5] -; CHECK-LE-NEXT: vmov.16 q1[5], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[6] -; CHECK-LE-NEXT: vmov.16 q1[6], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[7] -; CHECK-LE-NEXT: vmov.16 q1[7], r0 -; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: and r3, r0, #1 +; CHECK-LE-NEXT: uxtb r2, r0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sub.w r2, r1, r2, lsr #7 +; CHECK-LE-NEXT: bfi r1, r3, #0, #2 +; CHECK-LE-NEXT: ubfx r3, r0, #1, #1 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #2 +; CHECK-LE-NEXT: ubfx r3, r0, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #2 +; CHECK-LE-NEXT: ubfx r3, r0, #3, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #2 +; CHECK-LE-NEXT: ubfx r3, r0, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #8, #2 +; CHECK-LE-NEXT: ubfx r3, r0, #5, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #10, #2 +; CHECK-LE-NEXT: rsbs r0, r0, #0 +; CHECK-LE-NEXT: bfi r1, r0, #12, #2 +; CHECK-LE-NEXT: bfi r1, r2, #14, #2 +; CHECK-LE-NEXT: vmsr p0, r1 ; CHECK-LE-NEXT: vpsel q0, q0, q1 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_to_v8i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: uxtb r0, r0 -; CHECK-BE-NEXT: vmov.i8 q1, #0x0 -; CHECK-BE-NEXT: rbit r0, r0 -; CHECK-BE-NEXT: vmov.i8 q2, #0xff -; CHECK-BE-NEXT: lsrs r0, r0, #24 -; CHECK-BE-NEXT: vmsr p0, r0 -; CHECK-BE-NEXT: vpsel q2, q2, q1 -; CHECK-BE-NEXT: vmov.u8 r0, q2[0] -; CHECK-BE-NEXT: vmov.16 q1[0], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[1] -; CHECK-BE-NEXT: vmov.16 q1[1], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[2] -; CHECK-BE-NEXT: vmov.16 q1[2], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[3] -; CHECK-BE-NEXT: vmov.16 q1[3], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[4] -; CHECK-BE-NEXT: 
vmov.16 q1[4], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[5] -; CHECK-BE-NEXT: vmov.16 q1[5], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[6] -; CHECK-BE-NEXT: vmov.16 q1[6], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[7] -; CHECK-BE-NEXT: vmov.16 q1[7], r0 -; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-BE-NEXT: uxtb r2, r0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: sub.w r2, r1, r2, lsr #7 ; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: bfi r1, r2, #0, #2 +; CHECK-BE-NEXT: ubfx r2, r0, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: bfi r1, r2, #2, #2 +; CHECK-BE-NEXT: ubfx r2, r0, #5, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #4, #2 +; CHECK-BE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #6, #2 +; CHECK-BE-NEXT: ubfx r2, r0, #3, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #8, #2 +; CHECK-BE-NEXT: ubfx r2, r0, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #10, #2 +; CHECK-BE-NEXT: ubfx r2, r0, #1, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #12, #2 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: bfi r1, r0, #14, #2 +; CHECK-BE-NEXT: vmsr p0, r1 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = bitcast i8 %b to <8 x i1> @@ -128,27 +125,116 @@ entry: define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) { ; CHECK-LE-LABEL: bitcast_to_v16i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: and r3, r0, #1 +; CHECK-LE-NEXT: uxth r2, r0 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sub.w r2, r1, r2, lsr #15 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #1, #1 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: rsbs r3, r3, 
#0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #3, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #5, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #7, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #7, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #8, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #9, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #9, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #10, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #11, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #11, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #12, #1 +; CHECK-LE-NEXT: ubfx r3, r0, #13, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #13, #1 +; CHECK-LE-NEXT: rsbs r0, r0, #0 +; CHECK-LE-NEXT: bfi r1, r0, #14, #1 +; CHECK-LE-NEXT: bfi r1, r2, #15, #1 +; CHECK-LE-NEXT: vmsr p0, r1 ; CHECK-LE-NEXT: vpsel q0, q0, q1 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_to_v16i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: uxth r0, r0 +; CHECK-BE-NEXT: uxth r2, r0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: sub.w r2, r1, r2, lsr #15 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: rbit r0, r0 +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: 
ubfx r2, r0, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: lsrs r0, r0, #16 -; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #13, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #11, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #9, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #6, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #7, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #7, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #9, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #5, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #10, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #11, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #3, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #12, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #13, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #1, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #14, #1 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: bfi r1, r0, #15, #1 +; CHECK-BE-NEXT: vmsr p0, r1 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.8 q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = bitcast i16 %b to <16 x i1> @@ -159,41 +245,30 @@ entry: define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, 
<2 x i64> %a) { ; CHECK-LE-LABEL: bitcast_to_v2i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: and r0, r0, #3 -; CHECK-LE-NEXT: vmov.i8 q1, #0x0 -; CHECK-LE-NEXT: vmov.i8 q2, #0xff -; CHECK-LE-NEXT: vmsr p0, r0 -; CHECK-LE-NEXT: vpsel q1, q2, q1 -; CHECK-LE-NEXT: vmov.u8 r0, q1[1] -; CHECK-LE-NEXT: vmov.u8 r1, q1[0] -; CHECK-LE-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-LE-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-LE-NEXT: and r1, r0, #2 +; CHECK-LE-NEXT: and r0, r0, #1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: rsbs r0, r0, #0 +; CHECK-LE-NEXT: sub.w r1, r2, r1, lsr #1 +; CHECK-LE-NEXT: bfi r2, r0, #0, #8 +; CHECK-LE-NEXT: bfi r2, r1, #8, #8 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vmsr p0, r2 ; CHECK-LE-NEXT: vpsel q0, q0, q1 -; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_to_v2i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: rbit r0, r0 -; CHECK-BE-NEXT: vmov.i8 q1, #0x0 -; CHECK-BE-NEXT: vmov.i8 q2, #0xff -; CHECK-BE-NEXT: lsrs r0, r0, #30 -; CHECK-BE-NEXT: vmsr p0, r0 -; CHECK-BE-NEXT: vpsel q1, q2, q1 -; CHECK-BE-NEXT: vmov.u8 r0, q1[1] -; CHECK-BE-NEXT: vmov.u8 r1, q1[0] -; CHECK-BE-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-BE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-BE-NEXT: and r1, r0, #2 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: sub.w r1, r2, r1, lsr #1 ; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: bfi r2, r1, #0, #8 +; CHECK-BE-NEXT: bfi r2, r0, #8, #8 +; CHECK-BE-NEXT: vmsr p0, r2 ; CHECK-BE-NEXT: vpsel q0, q0, q1 -; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = bitcast i2 %b to <2 x i1> @@ -205,47 +280,29 @@ entry: define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) { ; CHECK-LE-LABEL: bitcast_from_v4i1: ; CHECK-LE: @ 
%bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r0, r1, #1 -; CHECK-LE-NEXT: rsbs r2, r0, #0 -; CHECK-LE-NEXT: movs r0, #0 -; CHECK-LE-NEXT: bfi r0, r2, #0, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #1, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #2, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r0, r1, #3, #1 -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: vmrs r0, p0 +; CHECK-LE-NEXT: and r2, r0, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #4, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 +; CHECK-LE-NEXT: ubfx r2, r0, #8, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #12, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #2 +; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #3 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_from_v4i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr -; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r0, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r2, r0, #0 -; CHECK-BE-NEXT: movs r0, #0 -; CHECK-BE-NEXT: bfi r0, r2, #0, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #1, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 -; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #2, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r0, r1, #3, #1 -; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: vmrs r0, p0 +; CHECK-BE-NEXT: ubfx r1, r0, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #12, #1 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1 +; CHECK-BE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #2 +; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #3 ; CHECK-BE-NEXT: 
bx lr entry: %c = icmp eq <4 x i32> %a, zeroinitializer @@ -256,73 +313,45 @@ entry: define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) { ; CHECK-LE-LABEL: bitcast_from_v8i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r0, r1, #1 -; CHECK-LE-NEXT: rsbs r2, r0, #0 -; CHECK-LE-NEXT: movs r0, #0 -; CHECK-LE-NEXT: bfi r0, r2, #0, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #1, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #2, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #3, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #4, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #5, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r0, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r0, r0 -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: vmrs r0, p0 +; CHECK-LE-NEXT: and r2, r0, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #2, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 +; CHECK-LE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #2 +; CHECK-LE-NEXT: ubfx r2, r0, #6, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3 +; CHECK-LE-NEXT: ubfx r2, r0, #8, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-LE-NEXT: ubfx r2, r0, #10, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #5 +; CHECK-LE-NEXT: ubfx r2, r0, #12, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #14, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #6 +; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #7 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_from_v8i1: ; 
CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr -; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r0, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r2, r0, #0 -; CHECK-BE-NEXT: movs r0, #0 -; CHECK-BE-NEXT: bfi r0, r2, #0, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #1, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #2, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #3, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #4, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #5, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 -; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #6, #1 -; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r0, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r0, r0 -; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: vmrs r0, p0 +; CHECK-BE-NEXT: ubfx r1, r0, #12, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #14, #1 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1 +; CHECK-BE-NEXT: ubfx r2, r0, #10, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #2 +; CHECK-BE-NEXT: ubfx r2, r0, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3 +; CHECK-BE-NEXT: ubfx r2, r0, #6, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-BE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #5 +; CHECK-BE-NEXT: ubfx r2, r0, #2, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #6 +; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #7 ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <8 x i16> %a, zeroinitializer @@ -333,24 +362,77 @@ entry: define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) { ; CHECK-LE-LABEL: bitcast_from_v16i1: ; CHECK-LE: @ %bb.0: 
@ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr ; CHECK-LE-NEXT: vmrs r0, p0 -; CHECK-LE-NEXT: uxth r0, r0 -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: and r2, r0, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #1, #1 +; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1 +; CHECK-LE-NEXT: ubfx r2, r0, #2, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #2 +; CHECK-LE-NEXT: ubfx r2, r0, #3, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3 +; CHECK-LE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-LE-NEXT: ubfx r2, r0, #5, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #5 +; CHECK-LE-NEXT: ubfx r2, r0, #6, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #6 +; CHECK-LE-NEXT: ubfx r2, r0, #7, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #7 +; CHECK-LE-NEXT: ubfx r2, r0, #8, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #8 +; CHECK-LE-NEXT: ubfx r2, r0, #9, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #9 +; CHECK-LE-NEXT: ubfx r2, r0, #10, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #10 +; CHECK-LE-NEXT: ubfx r2, r0, #11, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #11 +; CHECK-LE-NEXT: ubfx r2, r0, #12, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #12 +; CHECK-LE-NEXT: ubfx r2, r0, #13, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #13 +; CHECK-LE-NEXT: ubfx r2, r0, #14, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #15, #1 +; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #14 +; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #15 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_from_v16i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr ; CHECK-BE-NEXT: vmrs r0, p0 -; CHECK-BE-NEXT: rbit r0, r0 -; CHECK-BE-NEXT: lsrs r0, r0, #16 -; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: ubfx r1, r0, #14, #1 +; CHECK-BE-NEXT: ubfx r2, r0, #15, #1 +; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1 +; CHECK-BE-NEXT: ubfx r2, r0, #13, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #2 +; 
CHECK-BE-NEXT: ubfx r2, r0, #12, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3 +; CHECK-BE-NEXT: ubfx r2, r0, #11, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-BE-NEXT: ubfx r2, r0, #10, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #5 +; CHECK-BE-NEXT: ubfx r2, r0, #9, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #6 +; CHECK-BE-NEXT: ubfx r2, r0, #8, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #7 +; CHECK-BE-NEXT: ubfx r2, r0, #7, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #8 +; CHECK-BE-NEXT: ubfx r2, r0, #6, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #9 +; CHECK-BE-NEXT: ubfx r2, r0, #5, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #10 +; CHECK-BE-NEXT: ubfx r2, r0, #4, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #11 +; CHECK-BE-NEXT: ubfx r2, r0, #3, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #12 +; CHECK-BE-NEXT: ubfx r2, r0, #2, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #13 +; CHECK-BE-NEXT: ubfx r2, r0, #1, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #14 +; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #15 ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <16 x i8> %a, zeroinitializer @@ -361,35 +443,35 @@ entry: define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) { ; CHECK-LE-LABEL: bitcast_from_v2i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #4 -; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: orrs r0, r1 -; CHECK-LE-NEXT: csetm r1, eq -; CHECK-LE-NEXT: movs r0, #0 -; CHECK-LE-NEXT: bfi r0, r1, #0, #1 -; CHECK-LE-NEXT: vmov r1, r2, d1 -; CHECK-LE-NEXT: orrs r1, r2 -; CHECK-LE-NEXT: csetm r1, eq -; CHECK-LE-NEXT: bfi r0, r1, #1, #1 -; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: csetm r0, eq +; CHECK-LE-NEXT: bfi r1, r0, #0, #8 +; CHECK-LE-NEXT: vmov r0, r2, d1 +; CHECK-LE-NEXT: orrs r0, r2 +; CHECK-LE-NEXT: csetm r0, eq +; CHECK-LE-NEXT: bfi r1, r0, #8, #8 +; CHECK-LE-NEXT: ubfx r0, r1, #8, #1 +; CHECK-LE-NEXT: and r1, r1, #1 +; CHECK-LE-NEXT: orr.w r0, r1, r0, 
lsl #1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_from_v2i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #4 -; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov r0, r1, d3 +; CHECK-BE-NEXT: vmov r0, r1, d2 ; CHECK-BE-NEXT: orrs r0, r1 -; CHECK-BE-NEXT: csetm r1, eq -; CHECK-BE-NEXT: movs r0, #0 -; CHECK-BE-NEXT: bfi r0, r1, #0, #1 -; CHECK-BE-NEXT: vmov r1, r2, d2 -; CHECK-BE-NEXT: orrs r1, r2 -; CHECK-BE-NEXT: csetm r1, eq -; CHECK-BE-NEXT: bfi r0, r1, #1, #1 -; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: csetm r0, eq +; CHECK-BE-NEXT: bfi r1, r0, #0, #8 +; CHECK-BE-NEXT: vmov r0, r2, d3 +; CHECK-BE-NEXT: orrs r0, r2 +; CHECK-BE-NEXT: csetm r0, eq +; CHECK-BE-NEXT: bfi r1, r0, #8, #8 +; CHECK-BE-NEXT: and r0, r1, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #8, #1 +; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #1 ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll index ef87ac31fcf48..2285150620b1a 100644 --- a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll @@ -219,111 +219,93 @@ define <8 x bfloat> @select(i8 %x, <8 x bfloat> %y) nounwind { ; X64-LABEL: select: ; X64: # %bb.0: ; X64-NEXT: vmovaps %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x28,0xc8] -; X64-NEXT: movb %dil, %al # encoding: [0x40,0x88,0xf8] -; X64-NEXT: movb %al, -{{[0-9]+}}(%rsp) # encoding: [0x88,0x44,0x24,0xff] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0xff] -; X64-NEXT: movl %eax, %ecx # encoding: [0x89,0xc1] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X64-NEXT: shrb %cl # encoding: 
[0xd0,0xe9] -; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X64-NEXT: shrb $2, %cl # encoding: [0xc0,0xe9,0x02] -; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X64-NEXT: shrb $3, %cl # encoding: [0xc0,0xe9,0x03] -; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03] -; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X64-NEXT: shrb $4, %cl # encoding: [0xc0,0xe9,0x04] -; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X64-NEXT: shrb $5, %cl # encoding: [0xc0,0xe9,0x05] -; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X64-NEXT: shrb $6, %cl # encoding: [0xc0,0xe9,0x06] -; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X64-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: 
[0xc5,0xf9,0xc4,0xc1,0x06] -; X64-NEXT: shrb $7, %al # encoding: [0xc0,0xe8,0x07] -; X64-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: negl %eax # encoding: [0xf7,0xd8] -; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X64-NEXT: movb %dil, %cl # encoding: [0x40,0x88,0xf9] +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: shrb %dl # encoding: [0xd0,0xea] +; X64-NEXT: # implicit-def: $eax +; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X64-NEXT: # implicit-def: $edx +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: vmovd %edx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc2] +; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: shrb $2, %dl # encoding: [0xc0,0xea,0x02] +; X64-NEXT: # implicit-def: $eax +; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: shrb $3, %dl # encoding: [0xc0,0xea,0x03] +; X64-NEXT: # implicit-def: $eax +; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: shrb $4, %dl # encoding: [0xc0,0xea,0x04] +; X64-NEXT: # implicit-def: $eax +; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: shrb $5, %dl # encoding: [0xc0,0xea,0x05] +; X64-NEXT: # implicit-def: $eax +; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X64-NEXT: shrb $6, %dl # encoding: [0xc0,0xea,0x06] +; X64-NEXT: # implicit-def: 
$eax +; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X64-NEXT: shrb $7, %cl # encoding: [0xc0,0xe9,0x07] +; X64-NEXT: # implicit-def: $eax +; X64-NEXT: movb %cl, %al # encoding: [0x88,0xc8] +; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x0f] +; X64-NEXT: vpsraw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xe0,0x0f] ; X64-NEXT: vpandn %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdf,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; ; X86-LABEL: select: ; X86: # %bb.0: -; X86-NEXT: pushl %eax # encoding: [0x50] ; X86-NEXT: vmovaps %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x28,0xc8] -; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x08] -; X86-NEXT: movb %al, {{[0-9]+}}(%esp) # encoding: [0x88,0x44,0x24,0x03] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x03] -; X86-NEXT: movl %eax, %ecx # encoding: [0x89,0xc1] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X86-NEXT: shrb %cl # encoding: [0xd0,0xe9] -; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X86-NEXT: shrb $2, %cl # encoding: [0xc0,0xe9,0x02] -; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: movb 
%al, %cl # encoding: [0x88,0xc1] -; X86-NEXT: shrb $3, %cl # encoding: [0xc0,0xe9,0x03] -; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03] -; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X86-NEXT: shrb $4, %cl # encoding: [0xc0,0xe9,0x04] -; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X86-NEXT: shrb $5, %cl # encoding: [0xc0,0xe9,0x05] -; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1] -; X86-NEXT: shrb $6, %cl # encoding: [0xc0,0xe9,0x06] -; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01] -; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06] -; X86-NEXT: shrb $7, %al # encoding: [0xc0,0xe8,0x07] -; X86-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; X86-NEXT: negl %eax # encoding: [0xf7,0xd8] -; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl # encoding: [0x8a,0x4c,0x24,0x04] +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: shrb %dl # encoding: [0xd0,0xea] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X86-NEXT: # implicit-def: $edx +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: vmovd %edx, 
%xmm0 # encoding: [0xc5,0xf9,0x6e,0xc2] +; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: shrb $2, %dl # encoding: [0xc0,0xea,0x02] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: shrb $3, %dl # encoding: [0xc0,0xea,0x03] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: shrb $4, %dl # encoding: [0xc0,0xea,0x04] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: shrb $5, %dl # encoding: [0xc0,0xea,0x05] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca] +; X86-NEXT: shrb $6, %dl # encoding: [0xc0,0xea,0x06] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0] +; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; X86-NEXT: shrb $7, %cl # encoding: [0xc0,0xe9,0x07] +; X86-NEXT: # implicit-def: $eax +; X86-NEXT: movb %cl, %al # encoding: [0x88,0xc8] +; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x0f] +; X86-NEXT: vpsraw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xe0,0x0f] ; X86-NEXT: vpandn %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdf,0xc1] -; X86-NEXT: popl 
%eax # encoding: [0x58] ; X86-NEXT: retl # encoding: [0xc3] %1 = bitcast i8 %x to <8 x i1> %2 = select <8 x i1> %1, <8 x bfloat> zeroinitializer, <8 x bfloat> %y diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 501e73c46af9c..599a2514683d2 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -140,26 +140,36 @@ define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind { } define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind { -; SSE-LABEL: bitcast_v8i16_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i16_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: shrb $4, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm1 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i16_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb $4, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: retq ; ; AVX12-LABEL: bitcast_v8i16_to_v2i4: ; AVX12: # %bb.0: ; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: movl %eax, %ecx -; AVX12-NEXT: shrb $4, %cl -; AVX12-NEXT: andb $15, %al +; AVX12-NEXT: vpmovmskb %xmm0, %ecx +; AVX12-NEXT: movl %ecx, %eax +; AVX12-NEXT: shrb $4, %al ; AVX12-NEXT: addb %cl, %al -; AVX12-NEXT: # kill: 
def $al killed $al killed $eax ; AVX12-NEXT: retq ; ; AVX512-LABEL: bitcast_v8i16_to_v2i4: @@ -362,26 +372,41 @@ define i1 @trunc_v4i64_cmp(<4 x i64> %a0) nounwind { } define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v8i32_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i32_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2 +; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: shrb $4, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm1 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i32_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb $4, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v8i32_to_v2i4: ; AVX: # %bb.0: -; AVX-NEXT: vmovmskps %ymm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrb $4, %cl -; AVX-NEXT: andb $15, %al +; AVX-NEXT: vmovmskps %ymm0, %ecx +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrb $4, %al ; AVX-NEXT: addb %cl, %al -; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = icmp slt <8 x i32> %a0, 
zeroinitializer @@ -632,19 +657,46 @@ define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind { ; define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v8i64_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4 +; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: shrb $4, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm1 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i64_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb 
%xmm0, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb $4, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: retq ; ; AVX1-LABEL: bitcast_v8i64_to_v2i4: ; AVX1: # %bb.0: @@ -656,12 +708,10 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovmskps %ymm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: andb $15, %al +; AVX1-NEXT: vmovmskps %ymm0, %ecx +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: shrb $4, %al ; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -669,12 +719,10 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: andb $15, %al +; AVX2-NEXT: vmovmskps %ymm0, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: shrb $4, %al ; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr64655.ll b/llvm/test/CodeGen/X86/pr64655.ll index f2929527c88f2..350a7e7d30b68 100644 --- a/llvm/test/CodeGen/X86/pr64655.ll +++ b/llvm/test/CodeGen/X86/pr64655.ll @@ -7,32 +7,32 @@ define void @f(ptr %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: movzbl (%rdi), %eax ; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrb $2, %cl -; AVX2-NEXT: andb $1, %cl ; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: andb $1, %dl -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrb $3, %cl -; AVX2-NEXT: andb $1, %cl -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: andb $1, %cl -; AVX2-NEXT: 
vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrb $5, %cl -; AVX2-NEXT: andb $1, %cl -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: shrb $2, %r9b +; AVX2-NEXT: movzbl %r9b, %r9d +; AVX2-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: shrb $3, %r8b +; AVX2-NEXT: movzbl %r8b, %r8d +; AVX2-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: shrb $4, %sil +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrb $5, %dl +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 ; AVX2-NEXT: shrb $6, %cl -; AVX2-NEXT: andb $1, %cl -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: shrb $7, %al -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax