Skip to content

Commit f631324

Browse files
committed
[AMDGPU] misched: avoid subregister dependencies
Some VOP3P instructions operate on packed 32-bit values and can be configured (via op_sel/op_sel_hi) to use only one of the values. This patch adapts the scheduling dependencies so that, for example, a write to vgpr3 is not treated as a data dependency for a read from vgpr2_vgpr3 when only vgpr2 is actually used.
1 parent bba9172 commit f631324

15 files changed

+1381
-217
lines changed

llvm/include/llvm/CodeGen/TargetRegisterInfo.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,28 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
468468
return false;
469469
}
470470

471+
/// Returns true if the two subregisters are equal or overlap.
472+
/// The registers may be virtual registers.
473+
bool subRegsOverlap(Register RegA, unsigned SubA, Register RegB,
474+
unsigned SubB) const {
475+
if (RegA == RegB && SubA == SubB)
476+
return true;
477+
if (RegA.isVirtual() && RegB.isVirtual()) {
478+
if (RegA != RegB)
479+
return false;
480+
LaneBitmask LA = getSubRegIndexLaneMask(SubA);
481+
LaneBitmask LB = getSubRegIndexLaneMask(SubB);
482+
return (LA & LB).any();
483+
}
484+
if (RegA.isPhysical() && RegB.isPhysical()) {
485+
RegA = getSubReg(RegA.asMCReg(), SubA);
486+
RegB = getSubReg(RegB.asMCReg(), SubB);
487+
assert(RegB.isValid() && RegA.isValid() && "invalid subregister");
488+
return MCRegisterInfo::regsOverlap(RegA.asMCReg(), RegB.asMCReg());
489+
}
490+
return false;
491+
}
492+
471493
/// Returns true if Reg contains RegUnit.
472494
bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const {
473495
return llvm::is_contained(regunits(Reg), RegUnit);

llvm/lib/MC/MCRegisterInfo.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,9 @@ MCRegisterInfo::getMatchingSuperReg(MCRegister Reg, unsigned SubIdx,
114114
}
115115

116116
MCRegister MCRegisterInfo::getSubReg(MCRegister Reg, unsigned Idx) const {
117-
assert(Idx && Idx < getNumSubRegIndices() &&
118-
"This is not a subregister index");
117+
if (!Idx)
118+
return Reg;
119+
assert(Idx < getNumSubRegIndices() && "This is not a subregister index");
119120
// Get a pointer to the corresponding SubRegIndices list. This list has the
120121
// name of each sub-register in the same order as MCSubRegIterator.
121122
const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,122 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
628628
return std::pair(MaxNumVGPRs, MaxNumAGPRs);
629629
}
630630

631+
// Check which source operand UseOpIdx points to and return a pointer to the
632+
// operand of the corresponding source modifier.
633+
// Return nullptr if UseOpIdx either doesn't point to src0/1/2 or if there is no
634+
// operand for the corresponding source modifier.
635+
static const MachineOperand *
636+
getVOP3PSourceModifierFromOpIdx(const MachineInstr *UseI, int UseOpIdx,
637+
const SIInstrInfo &InstrInfo) {
638+
AMDGPU::OpName UseModName;
639+
AMDGPU::OpName UseName =
640+
AMDGPU::getOperandIdxName(UseI->getOpcode(), UseOpIdx);
641+
switch (UseName) {
642+
case AMDGPU::OpName::src0:
643+
UseModName = AMDGPU::OpName::src0_modifiers;
644+
break;
645+
case AMDGPU::OpName::src1:
646+
UseModName = AMDGPU::OpName::src1_modifiers;
647+
break;
648+
case AMDGPU::OpName::src2:
649+
UseModName = AMDGPU::OpName::src2_modifiers;
650+
break;
651+
default:
652+
return nullptr;
653+
}
654+
return InstrInfo.getNamedOperand(*UseI, UseModName);
655+
}
656+
657+
// Get the subreg idx of the subreg that is used by the given instruction
658+
// operand, considering the given op_sel modifier.
659+
// Return 0 if the whole register is used or as a conservative fallback.
660+
static unsigned getEffectiveSubRegIdx(const SIRegisterInfo *TRI,
661+
const SIInstrInfo &InstrInfo,
662+
const MachineOperand &Op) {
663+
const MachineInstr *I = Op.getParent();
664+
if (!InstrInfo.isVOP3P(*I) || InstrInfo.isWMMA(*I) || InstrInfo.isSWMMAC(*I))
665+
return 0;
666+
667+
const MachineOperand *OpMod =
668+
getVOP3PSourceModifierFromOpIdx(I, Op.getOperandNo(), InstrInfo);
669+
if (!OpMod)
670+
return 0;
671+
672+
// Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
673+
// the op_sel and op_sel_hi source modifiers:
674+
// - op_sel: selects low/high operand bits as input to the operation;
675+
// only has meaning for 16-bit source operands
676+
// - op_sel_hi: specifies the size of the source operands (16 or 32 bits);
677+
// a value of 0 indicates 32 bit, 1 indicates 16 bit
678+
// For the other VOP3P instructions, the semantics are:
679+
// - op_sel: selects low/high operand bits as input to the operation which
680+
// results in the lower-half of the destination
681+
// - op_sel_hi: selects the low/high operand bits as input to the operation
682+
// which results in the higher-half of the destination
683+
int64_t OpSel = OpMod->getImm() & SISrcMods::OP_SEL_0;
684+
int64_t OpSelHi = OpMod->getImm() & SISrcMods::OP_SEL_1;
685+
686+
// Check if all parts of the register are being used (= op_sel and op_sel_hi
687+
// differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we can return
688+
// early.
689+
if ((!InstrInfo.isVOP3PMix(*I) && (!OpSel || !OpSelHi) &&
690+
(OpSel || OpSelHi)) ||
691+
(InstrInfo.isVOP3PMix(*I) && !OpSelHi))
692+
return 0;
693+
694+
const TargetRegisterClass *RC =
695+
InstrInfo.getOpRegClass(*I, Op.getOperandNo());
696+
697+
if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
698+
TRI->getSubRegisterClass(RC, SubRegIdx))
699+
return SubRegIdx;
700+
if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
701+
TRI->getSubRegisterClass(RC, SubRegIdx))
702+
return SubRegIdx;
703+
704+
return 0;
705+
}
706+
707+
Register GCNSubtarget::getRealSchedDependency(const MachineInstr *DefI,
708+
int DefOpIdx,
709+
const MachineInstr *UseI,
710+
int UseOpIdx) const {
711+
const SIRegisterInfo *TRI = getRegisterInfo();
712+
const MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
713+
const MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
714+
Register DefReg = DefOp.getReg();
715+
Register UseReg = UseOp.getReg();
716+
717+
// If the registers aren't restricted to a sub-register, there is no point in
718+
// further analysis. This check only makes sense for virtual registers because
719+
// physical registers may form a tuple and thus be part of a superregister
720+
// although they are not a subregister themselves (vgpr0 is a "subreg" of
721+
// vgpr0_vgpr1 without being a subreg in itself).
722+
unsigned DefSubRegIdx = DefOp.getSubReg();
723+
if (DefReg.isVirtual() && !DefSubRegIdx)
724+
return DefReg;
725+
unsigned UseSubRegIdx = getEffectiveSubRegIdx(TRI, InstrInfo, UseOp);
726+
if (UseReg.isVirtual() && !UseSubRegIdx)
727+
return DefReg;
728+
729+
if (!TRI->subRegsOverlap(DefReg, DefSubRegIdx, UseReg, UseSubRegIdx))
730+
return 0; // no real dependency
731+
732+
// UseReg might be smaller or larger than DefReg, depending on the subreg and
733+
// on whether DefReg is a subreg, too. -> Find the smaller one. This does not
734+
// apply to virtual registers because we cannot construct a subreg for them.
735+
if (DefReg.isVirtual())
736+
return DefReg;
737+
MCRegister DefMCReg = TRI->getSubReg(DefReg.asMCReg(), DefSubRegIdx);
738+
MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
739+
const TargetRegisterClass *DefRC = TRI->getPhysRegBaseClass(DefMCReg);
740+
const TargetRegisterClass *UseRC = TRI->getPhysRegBaseClass(UseMCReg);
741+
// Some registers, such as SGPR[0-9]+_HI16, do not have a register class.
742+
if (!DefRC || !UseRC)
743+
return DefReg;
744+
return DefRC->hasSubClass(UseRC) ? UseMCReg : DefMCReg;
745+
}
746+
631747
void GCNSubtarget::adjustSchedDependency(
632748
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
633749
const TargetSchedModel *SchedModel) const {
@@ -638,6 +754,13 @@ void GCNSubtarget::adjustSchedDependency(
638754
MachineInstr *DefI = Def->getInstr();
639755
MachineInstr *UseI = Use->getInstr();
640756

757+
if (Register Reg = getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
758+
Dep.setReg(Reg);
759+
} else {
760+
Dep = SDep(Def, SDep::Artificial);
761+
return; // this is not a data dependency anymore
762+
}
763+
641764
if (DefI->isBundle()) {
642765
const SIRegisterInfo *TRI = getRegisterInfo();
643766
auto Reg = Dep.getReg();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
296296
SITargetLowering TLInfo;
297297
SIFrameLowering FrameLowering;
298298

299+
/// Get the register that represents the actual dependency between the
300+
/// definition and the use. The definition might only affect a subregister
301+
/// that is not actually used. Works for both virtual and physical registers.
302+
/// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
303+
/// Returns the definition register if there is a real dependency and no
304+
/// better match is found.
305+
Register getRealSchedDependency(const MachineInstr *DefI, int DefOpIdx,
306+
const MachineInstr *UseI, int UseOpIdx) const;
307+
299308
public:
300309
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
301310
const GCNTargetMachine &TM);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,26 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
839839
return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
840840
}
841841

842+
bool isVOP3PMix(const MachineInstr &MI) const {
843+
return isVOP3PMix(MI.getOpcode());
844+
}
845+
846+
bool isVOP3PMix(uint16_t Opcode) const {
847+
if (!isVOP3P(Opcode))
848+
return false;
849+
switch (Opcode) {
850+
case AMDGPU::V_FMA_MIXHI_F16:
851+
case AMDGPU::V_FMA_MIXLO_F16:
852+
case AMDGPU::V_FMA_MIX_F32:
853+
case AMDGPU::V_MAD_MIXHI_F16:
854+
case AMDGPU::V_MAD_MIXLO_F16:
855+
case AMDGPU::V_MAD_MIX_F32:
856+
return true;
857+
default:
858+
return false;
859+
}
860+
}
861+
842862
static bool isVINTRP(const MachineInstr &MI) {
843863
return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
844864
}

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,8 @@ let SubtargetPredicate = HasMadMixInsts in {
358358
let OtherPredicates = [NoFP32Denormals] in {
359359

360360
// These are VOP3a-like opcodes which accept no omod.
361-
// Size of src arguments (16/32) is controlled by op_sel.
362-
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
361+
// Size of src arguments (16/32) is controlled by op_sel_hi.
362+
// For 16-bit src arguments their location (hi/lo) is controlled by op_sel.
363363
let isCommutable = 1, mayRaiseFPException = 0 in {
364364
let isReMaterializable = 1 in
365365
defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

llvm/test/CodeGen/AMDGPU/calling-conventions.ll

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -965,11 +965,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
965965
;
966966
; GFX11-LABEL: ps_mesa_inreg_v5i32:
967967
; GFX11: ; %bb.0:
968-
; GFX11-NEXT: s_add_i32 s3, s3, 4
969-
; GFX11-NEXT: s_add_i32 s2, s2, 3
970968
; GFX11-NEXT: s_add_i32 s1, s1, 2
971969
; GFX11-NEXT: s_add_i32 s4, s4, 5
972970
; GFX11-NEXT: s_add_i32 s0, s0, 1
971+
; GFX11-NEXT: s_add_i32 s3, s3, 4
972+
; GFX11-NEXT: s_add_i32 s2, s2, 3
973973
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
974974
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
975975
; GFX11-NEXT: v_mov_b32_e32 v2, s2
@@ -980,12 +980,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
980980
;
981981
; GFX1250-LABEL: ps_mesa_inreg_v5i32:
982982
; GFX1250: ; %bb.0:
983-
; GFX1250-NEXT: s_add_co_i32 s3, s3, 4
984-
; GFX1250-NEXT: s_add_co_i32 s2, s2, 3
985983
; GFX1250-NEXT: s_add_co_i32 s1, s1, 2
986984
; GFX1250-NEXT: s_add_co_i32 s4, s4, 5
987985
; GFX1250-NEXT: s_add_co_i32 s0, s0, 1
988-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
986+
; GFX1250-NEXT: s_add_co_i32 s3, s3, 4
987+
; GFX1250-NEXT: s_add_co_i32 s2, s2, 3
989988
; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
990989
; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
991990
; GFX1250-NEXT: v_mov_b32_e32 v3, s3
@@ -1014,36 +1013,36 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
10141013
;
10151014
; VI-LABEL: ps_mesa_inreg_v5f32:
10161015
; VI: ; %bb.0:
1017-
; VI-NEXT: v_add_f32_e64 v3, s3, -1.0
1018-
; VI-NEXT: v_add_f32_e64 v2, s2, 4.0
10191016
; VI-NEXT: v_add_f32_e64 v1, s1, 2.0
10201017
; VI-NEXT: v_add_f32_e64 v0, s0, 1.0
10211018
; VI-NEXT: v_add_f32_e64 v4, s4, 0.5
1019+
; VI-NEXT: v_add_f32_e64 v3, s3, -1.0
1020+
; VI-NEXT: v_add_f32_e64 v2, s2, 4.0
10221021
; VI-NEXT: flat_store_dword v[0:1], v4
10231022
; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
10241023
; VI-NEXT: s_endpgm
10251024
;
10261025
; GFX11-LABEL: ps_mesa_inreg_v5f32:
10271026
; GFX11: ; %bb.0:
1028-
; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0
1029-
; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0
10301027
; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0
10311028
; GFX11-NEXT: v_add_f32_e64 v4, s4, 0.5
10321029
; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0
1030+
; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0
1031+
; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0
10331032
; GFX11-NEXT: s_clause 0x1
10341033
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
10351034
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
10361035
; GFX11-NEXT: s_endpgm
10371036
;
10381037
; GFX1250-LABEL: ps_mesa_inreg_v5f32:
10391038
; GFX1250: ; %bb.0:
1040-
; GFX1250-NEXT: s_add_f32 s3, s3, -1.0
10411039
; GFX1250-NEXT: s_add_f32 s4, s4, 0.5
10421040
; GFX1250-NEXT: s_add_f32 s0, s0, 1.0
10431041
; GFX1250-NEXT: s_add_f32 s1, s1, 2.0
1042+
; GFX1250-NEXT: s_add_f32 s3, s3, -1.0
10441043
; GFX1250-NEXT: s_add_f32 s2, s2, 4.0
1045-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2)
10461044
; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
1045+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
10471046
; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
10481047
; GFX1250-NEXT: v_mov_b32_e32 v3, s3
10491048
; GFX1250-NEXT: s_clause 0x1
@@ -1148,32 +1147,32 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
11481147
;
11491148
; VI-LABEL: ps_mesa_v5i32:
11501149
; VI: ; %bb.0:
1151-
; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3
1152-
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
11531150
; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1
11541151
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
11551152
; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v4
1153+
; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3
1154+
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
11561155
; VI-NEXT: flat_store_dword v[0:1], v4
11571156
; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
11581157
; VI-NEXT: s_endpgm
11591158
;
11601159
; GFX11-LABEL: ps_mesa_v5i32:
11611160
; GFX11: ; %bb.0:
1162-
; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3
1163-
; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
11641161
; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1
11651162
; GFX11-NEXT: v_add_nc_u32_e32 v4, 5, v4
11661163
; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
1164+
; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3
1165+
; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
11671166
; GFX11-NEXT: s_clause 0x1
11681167
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
11691168
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
11701169
; GFX11-NEXT: s_endpgm
11711170
;
11721171
; GFX1250-LABEL: ps_mesa_v5i32:
11731172
; GFX1250: ; %bb.0:
1174-
; GFX1250-NEXT: v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2
11751173
; GFX1250-NEXT: v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4
1176-
; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0
1174+
; GFX1250-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v3, 4, v3
1175+
; GFX1250-NEXT: v_add_nc_u32_e32 v2, 3, v2
11771176
; GFX1250-NEXT: s_clause 0x1
11781177
; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
11791178
; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
@@ -1199,30 +1198,30 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
11991198
;
12001199
; VI-LABEL: ps_mesa_v5f32:
12011200
; VI: ; %bb.0:
1202-
; VI-NEXT: v_add_f32_e32 v3, -1.0, v3
1203-
; VI-NEXT: v_add_f32_e32 v2, 4.0, v2
12041201
; VI-NEXT: v_add_f32_e32 v1, 2.0, v1
12051202
; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
12061203
; VI-NEXT: v_add_f32_e32 v4, 0.5, v4
1204+
; VI-NEXT: v_add_f32_e32 v3, -1.0, v3
1205+
; VI-NEXT: v_add_f32_e32 v2, 4.0, v2
12071206
; VI-NEXT: flat_store_dword v[0:1], v4
12081207
; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
12091208
; VI-NEXT: s_endpgm
12101209
;
12111210
; GFX11-LABEL: ps_mesa_v5f32:
12121211
; GFX11: ; %bb.0:
1213-
; GFX11-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
12141212
; GFX11-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
1215-
; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
1213+
; GFX11-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3
1214+
; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v2
12161215
; GFX11-NEXT: s_clause 0x1
12171216
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
12181217
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
12191218
; GFX11-NEXT: s_endpgm
12201219
;
12211220
; GFX1250-LABEL: ps_mesa_v5f32:
12221221
; GFX1250: ; %bb.0:
1223-
; GFX1250-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
12241222
; GFX1250-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
1225-
; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0
1223+
; GFX1250-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3
1224+
; GFX1250-NEXT: v_add_f32_e32 v2, 4.0, v2
12261225
; GFX1250-NEXT: s_clause 0x1
12271226
; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
12281227
; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off

0 commit comments

Comments
 (0)