diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ff8efd2debc21..0d2feeb4edea3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return false; } + // Packed math FP32 instructions typically accept SGPRs or VGPRs as source + // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read + // the first SGPR and use it for both the low and high operations. + if (isPackedFP32Inst(Opc) && isGFX12Plus()) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + + const MCOperand &Src0 = Inst.getOperand(Src0Idx); + const MCOperand &Src1 = Inst.getOperand(Src1Idx); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool { + unsigned Mask = 1U << Index; + return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0); + }; + + if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/0)) + return false; + if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/1)) + return false; + + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx != -1) { + const MCOperand &Src2 = Inst.getOperand(Src2Idx); + if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/2)) + return false; + } + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f20b22d14c984..19e6bcf6a219d 100644 
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -18,6 +18,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more + // information. + if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) { + for (unsigned I = 0; I < 3; ++I) { + if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I)) + return false; + } + } + return true; } @@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; unsigned Opc = MI.getOpcode(); + // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more + // information. 
+ if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && + MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { + constexpr const AMDGPU::OpName OpNames[] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; + + for (auto [I, OpName] : enumerate(OpNames)) { + int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]); + if (static_cast<unsigned>(SrcIdx) == OpIdx && + !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO)) + return false; + } + } + if (!isLegalRegOperand(MRI, OpInfo, MO)) return false; @@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return true; } +bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( + const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, + const MachineOperand *MO) const { + constexpr const unsigned NumOps = 3; + constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; + + assert(SrcN < NumOps); + + if (!MO) { + int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]); + if (SrcIdx == -1) + return true; + MO = &MI.getOperand(SrcIdx); + } + + if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg())) + return true; + + int ModsIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]); + if (ModsIdx == -1) + return true; + + unsigned Mods = MI.getOperand(ModsIdx).getImm(); + bool OpSel = Mods & SISrcMods::OP_SEL_0; + bool OpSelHi = Mods & SISrcMods::OP_SEL_1; + + return !OpSel && !OpSelHi; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineFunction &MF = *MI.getParent()->getParent(); @@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && !RI.isVGPR(MRI, 
MI.getOperand(VOP3Idx[2]).getReg())) legalizeOpWithMove(MI, VOP3Idx[2]); + + // Fix the register class of packed FP32 instructions on gfx12+. See + // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information. + if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) { + for (unsigned I = 0; I < 3; ++I) { + if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I)) + legalizeOpWithMove(MI, VOP3Idx[I]); + } + } } Register SIInstrInfo::readlaneVGPRToSGPR( diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e042b59eb0f04..6b9403f9c7a21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1287,6 +1287,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineOperand &MO) const; bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const; + + /// Check if \p MO would be a legal operand for gfx12+ packed math FP32 + /// instructions. Packed math FP32 instructions typically accept SGPRs or + /// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the + /// HW can only read the first SGPR and use it for both the low and high + /// operations. + /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2, + /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will + /// be used. + bool isLegalGFX12PlusPackedMathFP32Operand( + const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, + const MachineOperand *MO = nullptr) const; + /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. 
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 00dcb9b52d4bd..1e3e9a20afb2e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { return 128; } +bool isPackedFP32Inst(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_F32_gfx12: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_F32_gfx12: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMA_F32_gfx12: + return true; + default: + return false; + } +} + } // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1252e35d81e82..1bcd36cf6241c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg); bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); +LLVM_READONLY bool isPackedFP32Inst(unsigned Opc); + LLVM_READONLY bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset); diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 42401afb6edf2..8304be958f1ad 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -78,12 +78,14 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; GFX1250-LABEL: fadd_v2_vs: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] 
scale_offset +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -142,13 +144,16 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[6:7] +; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v4_vs: @@ -156,13 +161,16 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; 
GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7] +; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id @@ -332,56 +340,69 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1] -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, 
s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35] +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[12:13] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[14:15] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 
:: v_dual_mov_b32 v50, s6 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[10:11] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[16:17] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[40:41] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[42:43] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[36:37] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[38:39] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[48:49] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[38:39] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[44:45] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[46:47] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[50:51] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[36:37] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[42:43] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[18:19] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[20:21] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[22:23] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[8:9] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[54:55] +; GFX1250-SDAG-NEXT: v_pk_add_f32 
v[14:15], v[14:15], v[56:57] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[52:53] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[48:49] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[50:51] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[46:47] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[36:37] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35] +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v32_vs: @@ -389,54 +410,70 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[16:17] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] 
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[20:21] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[22:23] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[24:25] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[26:27] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[28:29] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[30:31] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[44:45] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[46:47] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[0:1] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[2:3] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[48:49] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[50:51] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 -; 
GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[4:5] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[6:7] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[52:53] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[54:55] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[8:9] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[10:11] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[34:35] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[12:13] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[14:15] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 +; 
GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -502,15 +539,16 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fadd_v2_v_imm: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -645,15 +683,16 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: 
s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -703,13 +742,15 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -746,17 +787,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm ; -; 
GFX1250-LABEL: fadd_v2_v_lit_lo0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000) -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-SDAG-LABEL: fadd_v2_v_lit_lo0: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x3f80000000000000) +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_lit_lo0: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -792,17 +847,31 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; 
PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm ; -; GFX1250-LABEL: fadd_v2_v_unfoldable_lit: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000) -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-SDAG-LABEL: fadd_v2_v_unfoldable_lit: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x400000003f800000) +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_unfoldable_lit: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ 
-1085,12 +1154,14 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1] -; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] neg_lo:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2: @@ -1159,12 +1230,14 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1] -; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; 
GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2: @@ -1262,12 +1335,14 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; GFX1250-LABEL: fmul_v2_vs: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -1326,13 +1401,16 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[4:5] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[6:7] +; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fmul_v4_vs: @@ -1340,13 +1418,16 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[6:7] +; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id @@ -1516,56 +1597,69 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 
7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1] -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35] +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[12:13] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[14:15] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29 +; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[10:11] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[16:17] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[40:41] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[42:43] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[36:37] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[38:39] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[48:49] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[38:39] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[44:45] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[46:47] -; 
GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[50:51] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[36:37] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[42:43] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[18:19] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[20:21] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[22:23] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[8:9] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[54:55] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[56:57] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[52:53] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[48:49] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[50:51] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[46:47] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[36:37] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], 
s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35] +; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fmul_v32_vs: @@ -1573,54 +1667,70 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[16:17] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[20:21] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[22:23] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[24:25] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[26:27] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[28:29] -; 
GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[30:31] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[44:45] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[46:47] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[0:1] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[2:3] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[48:49] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[50:51] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[4:5] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[6:7] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[52:53] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[8:9] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[10:11] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[34:35] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[12:13] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[14:15] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] 
offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -1685,15 +1795,16 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fmul_v2_v_imm: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -1828,15 
+1939,16 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -1873,17 +1985,31 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm ; -; GFX1250-LABEL: fmul_v2_v_unfoldable_lit: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-SDAG-LABEL: fmul_v2_v_unfoldable_lit: +; GFX1250-SDAG: ; %bb.0: +; 
GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000) +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v2_v_unfoldable_lit: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -2040,12 +2166,14 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; GFX1250-LABEL: fma_v2_vs: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] +; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset ; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -2104,13 +2232,16 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[4:5], v[4:5] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[6:7] +; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v4_vs: @@ -2118,13 +2249,16 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] 
scale_offset +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] -; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], v[4:5] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[6:7], v[6:7] +; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id @@ -2294,56 +2428,68 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fma_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v34, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1] -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96 -; 
GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v34, s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v34, s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v34, s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v34, s[34:35] +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v34, s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v34, s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v34, s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v34, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[12:13], s[12:13] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[14:15], s[14:15] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[22:23] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[30:31] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[28:29] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[12:13] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[56:57], s[14:15] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[2:3] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[4:5] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[6:7] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[24:25] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[26:27] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[8:9] +; 
GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[10:11] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[40:41], s[40:41] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[38:39], s[38:39] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[48:49], s[48:49] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[42:43], v[42:43] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[16:17] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[44:45], s[44:45] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[46:47], s[46:47] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[50:51], s[50:51] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[36:37], s[36:37] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[42:43], s[42:43] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[20:21], s[20:21] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[22:23], s[22:23] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[54:55], v[54:55] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[38:39], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[56:57], v[56:57] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[52:53], v[52:53] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], 
v[42:43], v[42:43] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[48:49], v[48:49] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[50:51], v[50:51] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[44:45], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[46:47], v[46:47] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[32:33], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[40:41], v[40:41] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[16:19], s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[12:15], s[34:35] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[8:11], s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[4:7], s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[20:23], s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[24:27], s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[0:3], s[34:35] +; GFX1250-SDAG-NEXT: global_store_b128 v34, v[28:31], s[34:35] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v32_vs: @@ -2351,54 +2497,70 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, 
v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[16:17], s[16:17] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[18:19], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[20:21], s[20:21] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[22:23], s[22:23] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[24:25], s[24:25] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[26:27], s[26:27] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[28:29], s[28:29] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[30:31], s[30:31] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[44:45], v[44:45] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[46:47], v[46:47] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[0:1], s[0:1] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[48:49], v[48:49] +; 
GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[50:51], v[50:51] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[4:5], s[4:5] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[6:7], s[6:7] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[52:53], v[52:53] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[54:55], v[54:55] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[8:9], s[8:9] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[10:11], s[10:11] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[32:33], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[34:35], v[34:35] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[12:13], s[12:13] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[14:15], s[14:15] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 +; 
GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -2488,17 +2650,19 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fma_v2_v_imm: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0x43480000 ; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 ; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -2653,17 +2817,19 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; 
GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1.0 ; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 ; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -2740,29 +2906,30 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000) -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[4:5], lit64(0x4040000040800000) +; GFX1250-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000) +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], lit64(0x400000003f800000) ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3] -; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; 
GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5] -; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -3268,20 +3435,22 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) ; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX1250-GISEL: ; %bb.0: ; %bb ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 +; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v0, v1 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> zeroinitializer, %arg @@ -3363,15 +3532,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_f32 s6, s1, s3 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], s[6:7] op_sel_hi:[1,0] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] -; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[0:1], s[2:3] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: 
v_dual_mov_b32 v5, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_fadd_fsub: @@ -3380,13 +3550,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v2, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] -; GFX1250-GISEL-NEXT: v_dual_subrev_f32 v3, s3, v0 :: v_dual_mov_b32 v0, 0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5] ; GFX1250-GISEL-NEXT: s_endpgm bb: @@ -3593,7 +3766,9 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] ; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_endpgm %fneg = fsub <2 x float> , %x diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s new file mode 100644 index 0000000000000..1ea64de5cbc9e --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_pk_fma_f32 v[8:9], s[0:1], v[0:1], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], v[0:1], v[4:5], s[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], s[0:1], v[0:1], v[4:5] op_sel:[1,0,0] op_sel_hi:[0,0,0] +// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], s[0:1], v[0:1], v[4:5] op_sel:[1,0,0] op_sel_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,1,0] op_sel_hi:[0,0,0] +// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], v[0:1], v[4:5], s[0:1] op_sel:[0,0,1] op_sel_hi:[0,0,0] +// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], s[0:1], v[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], v[0:1], s[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], s[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] +// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], v[0:1], s[0:1] 
op_sel:[0,1] op_sel_hi:[0,0] +// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,1] +// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], s[0:1], v[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], v[0:1], s[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], s[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] +// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,0] +// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,1] +// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], exec, v[0:1], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], v[0:1], exec, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_fma_f32 v[8:9], v[0:1], v[4:5], exec +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], exec, v[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_mul_f32 v[8:9], v[0:1], exec +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], exec, v[0:1] +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand + +v_pk_add_f32 v[8:9], v[0:1], exec +// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand