diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index f031353422e40..7973261341725 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -473,6 +473,28 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { return false; } + /// Returns true if the two subregisters are equal or overlap. + /// The registers may be virtual registers. + bool subRegsOverlap(Register RegA, unsigned SubA, Register RegB, + unsigned SubB) const { + if (RegA == RegB && SubA == SubB) + return true; + if (RegA.isVirtual() && RegB.isVirtual()) { + if (RegA != RegB) + return false; + LaneBitmask LA = getSubRegIndexLaneMask(SubA); + LaneBitmask LB = getSubRegIndexLaneMask(SubB); + return (LA & LB).any(); + } + if (RegA.isPhysical() && RegB.isPhysical()) { + MCRegister MCRegA = SubA ? getSubReg(RegA, SubA) : RegA.asMCReg(); + MCRegister MCRegB = SubB ? getSubReg(RegB, SubB) : RegB.asMCReg(); + assert(MCRegB.isValid() && MCRegA.isValid() && "invalid subregister"); + return MCRegisterInfo::regsOverlap(MCRegA, MCRegB); + } + return false; + } + /// Returns true if Reg contains RegUnit. bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const { return llvm::is_contained(regunits(Reg), RegUnit); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index c8bbcbbd76928..de757ea0ae3bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -627,6 +627,124 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { return std::pair(MaxNumVGPRs, MaxNumAGPRs); } +// Check which source operand UseOpIdx points to and return a pointer to the +// operand of the corresponding source modifier. +// Return nullptr if UseOpIdx either doesn't point to src0/1/2 or if there is no +// operand for the corresponding source modifier. 
+static const MachineOperand * +getVOP3PSourceModifierFromOpIdx(const MachineInstr *UseI, int UseOpIdx, + const SIInstrInfo &InstrInfo) { + AMDGPU::OpName UseModName; + AMDGPU::OpName UseName = + AMDGPU::getOperandIdxName(UseI->getOpcode(), UseOpIdx); + switch (UseName) { + case AMDGPU::OpName::src0: + UseModName = AMDGPU::OpName::src0_modifiers; + break; + case AMDGPU::OpName::src1: + UseModName = AMDGPU::OpName::src1_modifiers; + break; + case AMDGPU::OpName::src2: + UseModName = AMDGPU::OpName::src2_modifiers; + break; + default: + return nullptr; + } + return InstrInfo.getNamedOperand(*UseI, UseModName); +} + +// Get the subreg idx of the subreg that is used by the given instruction +// operand, considering the given op_sel modifier. +// Return 0 if the whole register is used or as a conservative fallback. +static unsigned getEffectiveSubRegIdx(const SIRegisterInfo &TRI, + const SIInstrInfo &InstrInfo, + const MachineOperand &Op) { + const MachineInstr *I = Op.getParent(); + if (!InstrInfo.isVOP3P(*I) || InstrInfo.isWMMA(*I) || InstrInfo.isSWMMAC(*I)) + return 0; + + const MachineOperand *OpMod = + getVOP3PSourceModifierFromOpIdx(I, Op.getOperandNo(), InstrInfo); + if (!OpMod) + return 0; + + // Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for + // the op_sel and op_sel_hi source modifiers: + // - op_sel: selects low/high operand bits as input to the operation; + // has only meaning for 16-bit source operands + // - op_sel_hi: specifies the size of the source operands (16 or 32 bits); + // a value of 0 indicates 32 bit, 1 indicates 16 bit + // For the other VOP3P instructions, the semantics are: + // - op_sel: selects low/high operand bits as input to the operation which + // results in the lower-half of the destination + // - op_sel_hi: selects the low/high operand bits as input to the operation + // which results in the higher-half of the destination + int64_t OpSel = OpMod->getImm() & SISrcMods::OP_SEL_0; + int64_t OpSelHi = 
OpMod->getImm() & SISrcMods::OP_SEL_1; + + // Check if all parts of the register are being used (= op_sel and op_sel_hi + // differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we can return + // early. + if ((!InstrInfo.isVOP3PMix(*I) && (!OpSel || !OpSelHi) && + (OpSel || OpSelHi)) || + (InstrInfo.isVOP3PMix(*I) && !OpSelHi)) + return 0; + + const MachineRegisterInfo &MRI = I->getParent()->getParent()->getRegInfo(); + const TargetRegisterClass *RC = TRI.getRegClassForOperandReg(MRI, Op); + + if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0; + TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC) + return SubRegIdx; + if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16; + TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC) + return SubRegIdx; + + return 0; +} + +Register GCNSubtarget::getRealSchedDependency(const MachineInstr *DefI, + int DefOpIdx, + const MachineInstr *UseI, + int UseOpIdx) const { + const SIRegisterInfo *TRI = getRegisterInfo(); + const MachineOperand &DefOp = DefI->getOperand(DefOpIdx); + const MachineOperand &UseOp = UseI->getOperand(UseOpIdx); + Register DefReg = DefOp.getReg(); + Register UseReg = UseOp.getReg(); + + // If the registers aren't restricted to a sub-register, there is no point in + // further analysis. This check only makes sense for virtual registers because + // physical registers may form a tuple and thus be part of a superregister + // although they are not a subregister themselves (vgpr0 is a "subreg" of + // vgpr0_vgpr1 without being a subreg in itself). 
+ unsigned DefSubRegIdx = DefOp.getSubReg(); + if (DefReg.isVirtual() && !DefSubRegIdx) + return DefReg; + unsigned UseSubRegIdx = getEffectiveSubRegIdx(*TRI, InstrInfo, UseOp); + if (UseReg.isVirtual() && !UseSubRegIdx) + return DefReg; + + if (!TRI->subRegsOverlap(DefReg, DefSubRegIdx, UseReg, UseSubRegIdx)) + return 0; // no real dependency + + // UseReg might be smaller or larger than DefReg, depending on the subreg and + // on whether DefReg is a subreg, too. -> Find the smaller one. This does not + // apply to virtual registers because we cannot construct a subreg for them. + if (DefReg.isVirtual()) + return DefReg; + MCRegister DefMCReg = + DefSubRegIdx ? TRI->getSubReg(DefReg, DefSubRegIdx) : DefReg.asMCReg(); + MCRegister UseMCReg = + UseSubRegIdx ? TRI->getSubReg(UseReg, UseSubRegIdx) : UseReg.asMCReg(); + const TargetRegisterClass *DefRC = TRI->getPhysRegBaseClass(DefMCReg); + const TargetRegisterClass *UseRC = TRI->getPhysRegBaseClass(UseMCReg); + // Some registers, such as SGPR[0-9]+_HI16, do not have a register class. + if (!DefRC || !UseRC) + return DefReg; + return DefRC->hasSubClass(UseRC) ? 
UseMCReg : DefMCReg; +} + void GCNSubtarget::adjustSchedDependency( SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const { @@ -637,6 +755,13 @@ void GCNSubtarget::adjustSchedDependency( MachineInstr *DefI = Def->getInstr(); MachineInstr *UseI = Use->getInstr(); + if (Register Reg = getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) { + Dep.setReg(Reg); + } else { + Dep = SDep(Def, SDep::Artificial); + return; // this is not a data dependency anymore + } + if (DefI->isBundle()) { const SIRegisterInfo *TRI = getRegisterInfo(); auto Reg = Dep.getReg(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ac660d5fada79..47a4caf07c554 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -299,6 +299,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, SITargetLowering TLInfo; SIFrameLowering FrameLowering; + /// Get the register that represents the actual dependency between the + /// definition and the use. The definition might only affect a subregister + /// that is not actually used. Works for both virtual and physical registers. + /// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC). + /// Returns the definition register if there is a real dependency and no + /// better match is found. 
+ Register getRealSchedDependency(const MachineInstr *DefI, int DefOpIdx, + const MachineInstr *UseI, int UseOpIdx) const; + public: GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5fdeddaf3f736..1759769ba23d1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -909,6 +909,26 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::VOP3P; } + bool isVOP3PMix(const MachineInstr &MI) const { + return isVOP3PMix(MI.getOpcode()); + } + + bool isVOP3PMix(uint16_t Opcode) const { + if (!isVOP3P(Opcode)) + return false; + switch (Opcode) { + case AMDGPU::V_FMA_MIXHI_F16: + case AMDGPU::V_FMA_MIXLO_F16: + case AMDGPU::V_FMA_MIX_F32: + case AMDGPU::V_MAD_MIXHI_F16: + case AMDGPU::V_MAD_MIXLO_F16: + case AMDGPU::V_MAD_MIX_F32: + return true; + default: + return false; + } + } + static bool isVINTRP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VINTRP; } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index c4692b71ca685..e1dbbfacfe177 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -358,8 +358,8 @@ let SubtargetPredicate = HasMadMixInsts in { let OtherPredicates = [NoFP32Denormals] in { // These are VOP3a-like opcodes which accept no omod. -// Size of src arguments (16/32) is controlled by op_sel. -// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. +// Size of src arguments (16/32) is controlled by op_sel_hi. +// For 16-bit src arguments their location (hi/lo) are controlled by op_sel. 
let isCommutable = 1, mayRaiseFPException = 0 in { let isReMaterializable = 1 in defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile>; diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 7dbbeaabeb715..8f9df402201fa 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -965,11 +965,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { ; ; GFX11-LABEL: ps_mesa_inreg_v5i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_i32 s3, s3, 4 -; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 2 ; GFX11-NEXT: s_add_i32 s4, s4, 5 ; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: s_add_i32 s3, s3, 4 +; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -980,12 +980,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v5i32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_add_co_i32 s3, s3, 4 -; GFX1250-NEXT: s_add_co_i32 s2, s2, 3 ; GFX1250-NEXT: s_add_co_i32 s1, s1, 2 ; GFX1250-NEXT: s_add_co_i32 s4, s4, 5 ; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_add_co_i32 s3, s3, 4 +; GFX1250-NEXT: s_add_co_i32 s2, s2, 3 ; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v3, s3 @@ -1014,22 +1013,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { ; ; VI-LABEL: ps_mesa_inreg_v5f32: ; VI: ; %bb.0: -; VI-NEXT: v_add_f32_e64 v3, s3, -1.0 -; VI-NEXT: v_add_f32_e64 v2, s2, 4.0 ; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 ; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 ; VI-NEXT: v_add_f32_e64 v4, s4, 0.5 +; VI-NEXT: v_add_f32_e64 v3, s3, -1.0 +; VI-NEXT: v_add_f32_e64 
v2, s2, 4.0 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: ps_mesa_inreg_v5f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0 -; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 ; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 ; GFX11-NEXT: v_add_f32_e64 v4, s4, 0.5 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off @@ -1037,13 +1036,13 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v5f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_add_f32 s3, s3, -1.0 ; GFX1250-NEXT: s_add_f32 s4, s4, 0.5 ; GFX1250-NEXT: s_add_f32 s0, s0, 1.0 ; GFX1250-NEXT: s_add_f32 s1, s1, 2.0 +; GFX1250-NEXT: s_add_f32 s3, s3, -1.0 ; GFX1250-NEXT: s_add_f32 s2, s2, 4.0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2) ; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v3, s3 ; GFX1250-NEXT: s_clause 0x1 @@ -1148,22 +1147,22 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { ; ; VI-LABEL: ps_mesa_v5i32: ; VI: ; %bb.0: -; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: ps_mesa_v5i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 ; 
GFX11-NEXT: v_add_nc_u32_e32 v4, 5, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off @@ -1171,9 +1170,9 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v5i32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2 ; GFX1250-NEXT: v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4 -; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v3, 4, v3 +; GFX1250-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b32 v[0:1], v4, off ; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off @@ -1199,20 +1198,20 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { ; ; VI-LABEL: ps_mesa_v5f32: ; VI: ; %bb.0: -; VI-NEXT: v_add_f32_e32 v3, -1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_add_f32_e32 v4, 0.5, v4 +; VI-NEXT: v_add_f32_e32 v3, -1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: ps_mesa_v5f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 -; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3 +; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off @@ -1220,9 +1219,9 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v5f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: 
v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 ; GFX1250-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 -; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX1250-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3 +; GFX1250-NEXT: v_add_f32_e32 v2, 4.0, v2 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b32 v[0:1], v4, off ; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 60ac0b943faf4..d913c539cc4f1 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8098,8 +8098,8 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 -; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 ; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8254,9 +8254,9 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2 ; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1 ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1 -; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2 ; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -8274,9 +8274,9 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1 +; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 ; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 
0x41800000, v1 ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 -; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 ; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index 689d1472d6010..edda6a58a788c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -79,10 +79,10 @@ ; GCN-NEXT: ; implicit-def: $vgpr211 ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 ; GCN-NEXT: ; implicit-def: $vgpr198 - ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 @@ -505,45 +505,44 @@ ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 - ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 - ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 - ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 - ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 - ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: 
v_exp_f32_e32 v113, v112 ; GCN-NEXT: v_exp_f32_e32 v114, v138 ; GCN-NEXT: v_exp_f32_e32 v115, v139 ; GCN-NEXT: v_exp_f32_e32 v116, v140 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 + ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 ; GCN-NEXT: v_exp_f32_e32 v117, v141 ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 ; GCN-NEXT: v_exp_f32_e32 v118, v142 + ; GCN-NEXT: v_exp_f32_e32 v119, v143 ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 ; GCN-NEXT: v_exp_f32_e32 v120, v144 - ; GCN-NEXT: v_exp_f32_e32 v113, v112 - ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 - ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 - ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 + ; GCN-NEXT: v_exp_f32_e32 v112, v129 ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 - ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 - ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 - ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 - ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 - ; GCN-NEXT: v_exp_f32_e32 v112, v129 - ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 ; GCN-NEXT: v_pk_mul_f32 
v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] @@ -554,30 +553,30 @@ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v119, v143 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v121, v148 ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] - ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 - ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 - ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 - ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112 - ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112 - ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112 + ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 
v[24:25], v[24:25], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e64 v28, v28, v112 + ; GCN-NEXT: v_mul_f32_e64 v29, v29, v112 + ; GCN-NEXT: v_mul_f32_e64 v30, v30, v112 + ; GCN-NEXT: v_mul_f32_e64 v31, v31, v112 + ; GCN-NEXT: v_mul_f32_e64 v48, v48, v112 + ; GCN-NEXT: v_mul_f32_e64 v49, v49, v112 ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] @@ -589,47 +588,48 @@ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 - ; GCN-NEXT: v_exp_f32_e32 v121, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 ; GCN-NEXT: v_exp_f32_e32 v122, v149 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v123, v150 ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 - ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v123, v150 + ; GCN-NEXT: v_exp_f32_e32 v124, v151 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 ; 
GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 - ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v124, v151 + ; GCN-NEXT: v_exp_f32_e32 v96, v129 ; GCN-NEXT: ds_read_b128 v[130:133], v197 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 - ; GCN-NEXT: v_exp_f32_e32 v96, v129 ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] ; GCN-NEXT: v_exp_f32_e32 v97, v125 ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v98, v138 ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 - ; GCN-NEXT: v_exp_f32_e32 v98, v138 ; GCN-NEXT: v_exp_f32_e32 v99, v127 ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..4e16ff82f1e60 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -218,54 +218,48 @@ ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 ; GCN-NEXT: v_exp_f32_e32 v73, v16 + ; GCN-NEXT: v_exp_f32_e32 v17, v17 ; GCN-NEXT: v_exp_f32_e32 v74, v18 ; GCN-NEXT: v_exp_f32_e32 v75, v19 ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 + ; GCN-NEXT: v_exp_f32_e32 v80, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 + ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 ; GCN-NEXT: v_exp_f32_e32 v81, v21 ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 ; GCN-NEXT: v_exp_f32_e32 v82, v22 ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 + ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 + ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 - ; GCN-NEXT: 
v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_exp_f32_e32 v83, v23 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 ; GCN-NEXT: ds_read_b128 v[18:21], v84 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] @@ -276,30 +270,35 @@ ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 + ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 + ; GCN-NEXT: v_exp_f32_e32 v85, v22 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 + ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 + ; GCN-NEXT: v_exp_f32_e32 v88, v23 + ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] ; 
GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 + ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 + ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 + ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 ; GCN-NEXT: ds_read_b128 v[26:29], v91 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -322,31 +321,37 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v79, v65 ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 ; GCN-NEXT: v_exp_f32_e32 v22, v64 ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 ; 
GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 ; GCN-NEXT: v_exp_f32_e32 v23, v23 ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 + ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 + ; GCN-NEXT: v_exp_f32_e32 v25, v67 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 @@ -356,62 +361,76 @@ ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 ; GCN-NEXT: v_exp_f32_e32 v65, v65 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 ; GCN-NEXT: v_exp_f32_e32 v68, v2 ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 - ; GCN-NEXT: ;;#ASMSTART - ; GCN-NEXT: s_waitcnt vmcnt(8) - ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_b128 v[0:3], v84 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 ; GCN-NEXT: v_exp_f32_e32 v24, v24 ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 ; GCN-NEXT: v_exp_f32_e32 v26, v26 ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 ; GCN-NEXT: v_exp_f32_e32 v6, v6 ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 ; GCN-NEXT: v_exp_f32_e32 v7, v7 ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 + ; GCN-NEXT: v_exp_f32_e32 v8, v8 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 + ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 + ; GCN-NEXT: v_exp_f32_e32 v9, v9 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 + ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 ; GCN-NEXT: v_exp_f32_e32 v10, v0 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 + ; GCN-NEXT: v_mul_f32_e32 v11, 
0x3fb8aa3b, v11 + ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 + ; GCN-NEXT: v_exp_f32_e32 v11, v11 ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 + ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 + ; GCN-NEXT: v_exp_f32_e32 v12, v3 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 + ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 + ; GCN-NEXT: v_exp_f32_e32 v15, v3 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 @@ -419,46 +438,27 @@ ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 ; GCN-NEXT: v_exp_f32_e32 v17, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 ; GCN-NEXT: 
v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 ; GCN-NEXT: v_exp_f32_e32 v18, v1 ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 ; GCN-NEXT: ds_read_b128 v[0:3], v91 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 + ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] @@ -478,13 +478,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index f93e5f06beff9..529549f9430ae 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -6669,8 +6669,8 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 -; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001 ; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001 ; GFX12-NEXT: global_store_b128 v1, 
v[0:3], s[0:1] offset:48 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir new file mode 100644 index 0000000000000..11ce1b0047a68 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir @@ -0,0 +1,973 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -stop-after=machine-scheduler -filetype=null %s 2>&1 | FileCheck -check-prefix=GCN %s + +# Note: the source modifier is the parameter before the source itself. So, +# src0_modifiers is the parameter in the list before src0, src1_modifiers before +# src1. +# For the srcN_modifiers, the following values are relevant for these tests: +# - 0: op_sel=0 and op_sel_hi=0 +# - 4: op_sel=1 and op_sel_hi=0 +# - 8: op_sel=0 and op_sel_hi=1 +# - 12: op_sel=1 and op_sel_hi=1 +# For every test where we test two register arguments, the size of the arguments +# and the used parts are encoded in the test name. Examples: +# - *_32_lo_lo_32_lo_lo: two args of size 32 where only the low parts are used +# - *_16_lo_hi_16_hi_hi: two args of size 16 where both parts of the first arg +# and the high part of the second arg are used +# For a "(lo|hi)_(lo|hi)" pair, the first field denotes the part controlled by +# op_sel, the second field the one controlled by op_sel_hi. 
+# +# For the mad_mix_* tests, op_sel and op_sel_hi have slightly different semantics: +# - op_sel_hi: selects if the full 32bit of the arg should be used or only a +# 16bit part (which is then selected by op_sel) +# op_sel_hi=0 selects 32bit +# op_sel_hi=1 selects 16bit +# - op_sel: selects low/high part of arg +# So, for the srcN_modifiers, we have the following values: +# - 0: op_sel=0 and 32bit (op_sel_hi=0) +# - 4: op_sel=1 and 32bit (op_sel_hi=0) +# - 8: op_sel=0 and 16bit (op_sel_hi=1) +# - 12: op_sel=1 and 16bit (op_sel_hi=1) + +--- +name: pk_mul_virtual_32_lo_lo_32_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_physical_32_lo_lo_32_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_virtual_16_lo_lo_16_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_physical_16_lo_lo_16_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_virtual_32_lo_lo_32_lo_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 3 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_physical_32_lo_lo_32_hi_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 5 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_virtual_16_lo_lo_16_lo_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 3 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_physical_16_lo_lo_16_hi_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 5 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_virtual_32_hi_lo_32_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 3 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + 
S_ENDPGM 0 +... + +--- +name: pk_mul_physical_32_lo_hi_32_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 5 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_virtual_16_hi_lo_16_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 3 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_physical_16_hi_lo_16_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 5 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_virtual_32_hi_hi_32_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 
0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_physical_32_hi_hi_32_lo_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 6 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_virtual_16_hi_hi_16_hi_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_physical_16_hi_hi_16_hi_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_virtual_32_hi_hi_32_hi_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: Pressure Diff + ; + undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_physical_32_hi_hi_32_hi_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: Pressure Diff + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_virtual_16_lo_lo_16_lo_lo_superreg_definition +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + %0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_physical_16_lo_lo_16_lo_lo_superreg_definition +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 2 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_vgpr1 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_virtual_16_hi_hi_16_hi_hi_superreg_definition +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + %0:vreg_64_align2 = IMPLICIT_DEF + %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 2 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_vgpr1 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition1 +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr1_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 3 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_vgpr1 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr1_lo16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: mad_mix_virtual_16_lo_16_lo_16_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mad_mix_physical_16_lo_16_lo_16_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: mad_mix_virtual_16_hi_16_hi_16_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mad_mix_physical_16_hi_16_hi_16_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: mad_mix_virtual_16_hi_16_lo_16_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mad_mix_physical_16_hi_16_lo_16_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 6 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(1): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Ord Latency=0 Artificial + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: mad_mix_virtual_32_hi_32_hi_32_hi +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 0 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + undef %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mad_mix_physical_32_lo_32_lo_32_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(0): Out Latency=1 + ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: mad_mix_virtual_32_hi_32_hi_32_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): %0:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): %0.lo16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): %0.hi16:vgpr_32 = IMPLICIT_DEF + ; GCN-LABEL: SU(3): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 2 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 1 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(2): Data Latency=0 Reg=%0 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0 + ; GCN-NEXT: Pressure Diff + ; + %0:vgpr_32 = IMPLICIT_DEF + %0.lo16:vgpr_32 = IMPLICIT_DEF + %0.hi16:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mad_mix_physical_32_lo_32_hi_32_lo +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF + ; GCN-LABEL: SU(1): $vgpr0_lo16 = IMPLICIT_DEF + ; GCN-LABEL: SU(2): $vgpr0_hi16 = IMPLICIT_DEF + ; GCN-LABEL: SU(3): $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: # preds left : 4 + ; GCN-NEXT: # succs left : 0 + ; GCN-NEXT: # rdefs left : 0 + ; GCN-NEXT: Latency : 1 + ; GCN-NEXT: Depth : 2 + ; GCN-NEXT: Height : 0 + ; GCN-NEXT: Predecessors: + ; GCN-NEXT: SU(2): Out Latency=1 + ; GCN-NEXT: SU(2): Data Latency=0 Reg=$vgpr0_hi16 + ; GCN-NEXT: SU(1): Out Latency=1 + ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_lo16 + ; GCN-NEXT: Pressure Diff + ; + $vgpr0 = IMPLICIT_DEF + $vgpr0_lo16 = IMPLICIT_DEF + $vgpr0_hi16 = IMPLICIT_DEF + $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir index a2a0794ac59f3..aeb54bc080d58 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir @@ -94,10 +94,10 @@ body: | ; BOTTOMUP-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_V1_V2_gfx11 $vgpr9_vgpr10, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; BOTTOMUP-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_SAMPLE_V4_V2_gfx11 killed $vgpr9_vgpr10, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; BOTTOMUP-NEXT: } - ; BOTTOMUP-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec ; BOTTOMUP-NEXT: nofpexcept V_CMP_GT_F32_e32 1065353216, killed $vgpr11, implicit-def $vcc_lo, implicit $mode, implicit $exec ; BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = COPY $vcc ; BOTTOMUP-NEXT: nofpexcept V_CMP_GT_F32_e32 1065353216, killed $vgpr8, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; BOTTOMUP-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec ; BOTTOMUP-NEXT: renamable $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr0_sgpr1, killed renamable $vcc, implicit-def dead $scc ; BOTTOMUP-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec ; BOTTOMUP-NEXT: renamable $vgpr12 = V_MOV_B32_e32 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir index 77e67b2732481..27908957b5886 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir @@ -15,7 +15,7 @@ # CHECK-NEXT: SU(0): Data Latency=1 Reg=$vgpr0 # CHECK: 
Successors: # CHECK-NEXT: SU(4): Out Latency=1 -# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1 +# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0 # CHECK-NEXT: SU(3): Out Latency=1 # CHECK-NEXT: SU(3): Data Latency=1 Reg=$vcc # CHECK: SU(3): $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def dead $vcc, implicit $vcc, implicit $exec @@ -26,13 +26,13 @@ # CHECK-NEXT: SU(1): Data Latency=1 Reg=$vgpr1 # CHECK: Successors: # CHECK-NEXT: SU(4): Out Latency=1 -# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1 +# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr1 # CHECK: SU(4): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr # CHECK: Predecessors: # CHECK-NEXT: SU(3): Out Latency=1 -# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1 +# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr1 # CHECK-NEXT: SU(2): Out Latency=1 -# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0_vgpr1 +# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0 # CHECK: Successors: # CHECK-NEXT: ExitSU: Ord Latency=3 Artificial diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..ead040fd14280 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -701,15 +701,15 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -722,10 +722,10 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -891,15 +891,15 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -912,10 +912,10 @@ define amdgpu_ps float 
@ps_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -995,8 +995,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 @@ -1724,15 +1724,15 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; 
GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -1745,10 +1745,10 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -1914,15 +1914,15 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -1935,10 +1935,10 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; 
GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -2018,8 +2018,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 @@ -2747,15 +2747,15 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 
v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -2768,10 +2768,10 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -2937,15 +2937,15 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -2958,10 +2958,10 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: 
v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -3041,8 +3041,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 @@ -3767,15 +3767,15 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; 
GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -3788,10 +3788,10 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -3957,15 +3957,15 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -3978,10 +3978,10 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 
0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -4061,8 +4061,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 @@ -4787,15 +4787,15 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -4808,10 +4808,10 @@ define 
amdgpu_gs float @gs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -4977,15 +4977,15 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -4998,10 +4998,10 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; 
GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -5081,8 +5081,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 @@ -5817,15 +5817,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -5838,10 +5838,10 @@ define amdgpu_hs <{i32, i32, i32, float}> 
@hs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -6009,15 +6009,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -6030,10 +6030,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 
0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -6113,9 +6113,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 @@ -6848,15 +6848,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -6869,10 +6869,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 @@ -7040,15 +7040,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 ; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 @@ -7061,10 +7061,10 @@ define amdgpu_gs <{i32, i32, i32, float}> 
@gs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 @@ -7144,9 +7144,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256