Skip to content

Commit 934f802

Browse files
authored
[AMDGPU][True16][CodeGen] true16 isel pattern for fma_mix_f16/bf16 (#159648)
This patch includes: 1. The fma_mix instructions take fp16 values as input but place the operands in vgpr32. Update the selector to insert a vgpr32 in true16 mode if necessary. 2. The fma_mix instructions return an fp16 value as output but place the vdst in vgpr32. Create a fma_mix_t16 pseudo instruction for the isel patterns, and lower it to mix_lo/hi in the MC lowering pass. These changes stop isel from emitting the illegal `vgpr32 = COPY vgpr16` and improve code quality.
1 parent 3be8294 commit 934f802

13 files changed

+534
-289
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4078,18 +4078,26 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
40784078
// register.
40794079

40804080
Mods |= SISrcMods::OP_SEL_1;
4081-
if (IsExtractHigh ||
4082-
(Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
4083-
Mods |= SISrcMods::OP_SEL_0;
4081+
if (Src.getValueSizeInBits() == 16) {
4082+
if (isExtractHiElt(Src, Src)) {
4083+
Mods |= SISrcMods::OP_SEL_0;
40844084

4085-
// TODO: Should we try to look for neg/abs here?
4086-
}
4085+
// TODO: Should we try to look for neg/abs here?
4086+
return true;
4087+
}
4088+
4089+
if (Src.getOpcode() == ISD::TRUNCATE &&
4090+
Src.getOperand(0).getValueType() == MVT::i32) {
4091+
Src = Src.getOperand(0);
4092+
return true;
4093+
}
4094+
4095+
if (Subtarget->useRealTrue16Insts())
4096+
// In true16 mode, pack src to a 32bit
4097+
Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4098+
} else if (IsExtractHigh)
4099+
Mods |= SISrcMods::OP_SEL_0;
40874100

4088-
// Prevent unnecessary subreg COPY to VGPR_16
4089-
if (Src.getOpcode() == ISD::TRUNCATE &&
4090-
Src.getOperand(0).getValueType() == MVT::i32) {
4091-
Src = Src.getOperand(0);
4092-
}
40934101
return true;
40944102
}
40954103

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,40 @@ void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
175175
}
176176
}
177177

178+
// Lowers a V_FMA_MIX_F16_t16 / V_FMA_MIX_BF16_t16 MachineInstr into OutMI.
// The opcode is rewritten to the matching MIXHI or MIXLO opcode depending on
// what isHi16Reg reports for the vdst register, and the vdst operand is
// emitted as the register returned by get32BitRegister. All other explicit
// operands go through lowerOperand unchanged.
void AMDGPUMCInstLower::lowerT16FmaMixFP16(const MachineInstr *MI,
179+
MCInst &OutMI) const {
180+
unsigned Opcode = MI->getOpcode();
181+
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
182+
const SIRegisterInfo &TRI = TII->getRegisterInfo();
183+
184+
// Locate vdst by name; its register decides between the HI and LO opcode.
int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode, llvm::AMDGPU::OpName::vdst);
185+
const MachineOperand &VDst = MI->getOperand(VDstIdx);
186+
bool IsHi = AMDGPU::isHi16Reg(VDst.getReg(), TRI);
187+
// No default case: any other opcode is left as-is. The visible caller in
// lower() only dispatches here for the two opcodes handled below.
switch (Opcode) {
188+
case AMDGPU::V_FMA_MIX_F16_t16:
189+
Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_F16 : AMDGPU::V_FMA_MIXLO_F16;
190+
break;
191+
case AMDGPU::V_FMA_MIX_BF16_t16:
192+
Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_BF16 : AMDGPU::V_FMA_MIXLO_BF16;
193+
break;
194+
}
195+
// Translate the chosen MIXHI/MIXLO opcode to its MC opcode.
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
196+
assert(MCOpcode != -1 &&
197+
"Pseudo instruction doesn't have a target-specific version");
198+
OutMI.setOpcode(MCOpcode);
199+
200+
// Lower the explicit operands in order, substituting the 32-bit register
// for vdst.
for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
202+
const MachineOperand &MO = MI->getOperand(I);
203+
MCOperand MCOp;
204+
if (I == VDstIdx)
205+
MCOp = MCOperand::createReg(TRI.get32BitRegister(VDst.getReg()));
206+
else
207+
lowerOperand(MO, MCOp);
208+
OutMI.addOperand(MCOp);
209+
}
210+
}
211+
178212
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
179213
unsigned Opcode = MI->getOpcode();
180214
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
@@ -201,6 +235,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
201235
} else if (AMDGPU::getT16D16Helper(Opcode)) {
202236
lowerT16D16Helper(MI, OutMI);
203237
return;
238+
} else if (Opcode == AMDGPU::V_FMA_MIX_F16_t16 ||
239+
Opcode == AMDGPU::V_FMA_MIX_BF16_t16) {
240+
lowerT16FmaMixFP16(MI, OutMI);
241+
return;
204242
}
205243

206244
int MCOpcode = TII->pseudoToMCOpcode(Opcode);

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class AMDGPUMCInstLower {
3838
void lower(const MachineInstr *MI, MCInst &OutMI) const;
3939

4040
void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
41+
void lowerT16FmaMixFP16(const MachineInstr *MI, MCInst &OutMI) const;
4142
};
4243

4344
namespace {

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9506,6 +9506,13 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
95069506
DescSize = Desc.getSize();
95079507
}
95089508

9509+
// If FMA Pseudo inst, get correct MC code size
9510+
if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9511+
// All potential lowerings are the same size; arbitrarily pick one.
9512+
const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9513+
DescSize = Desc.getSize();
9514+
}
9515+
95099516
return DescSize;
95109517
}
95119518
}

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 91 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
6464
"$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
6565
}
6666

67+
// Profile used by the true16 fma_mix pseudo instructions. It derives from
// VOP3P_Mix_Profile with the third template argument fixed to 0, sets both
// true16 flags, and overrides DstRC64 with the getVALUDstForVT result for
// P.DstVT (IsTrue16 = 1, IsVOP3Encoding = 1).
// NOTE(review): the meaning of the third VOP3P_Mix_Profile argument is not
// visible here -- confirm against its definition.
class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
68+
: VOP3P_Mix_Profile<P, Features, 0> {
69+
let IsTrue16 = 1;
70+
let IsRealTrue16 = 1;
71+
let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
72+
}
73+
6774
multiclass VOP3PInst<string OpName, VOPProfile P,
6875
SDPatternOperator node = null_frag, bit IsDOT = 0> {
6976
def NAME : VOP3P_Pseudo<OpName, P,
@@ -95,6 +102,16 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
95102
} // end SubtargetPredicate = isGFX11Plus
96103
}
97104

105+
// Defines the VOP3P pseudo for a true16 mix instruction named OpName with
// profile P. When P.HasExtVOP3DPP is set, a "_dpp" VOP3 DPP pseudo is defined
// as well, with VOP3P = 1 and PseudoInstr set to OpName followed by "_dpp".
multiclass VOP3_VOP3PInst_t16<string OpName, VOP3P_Mix_Profile P> {
106+
def NAME : VOP3P_Pseudo<OpName, P>;
107+
108+
// The DPP variant is only emitted for profiles that have HasExtVOP3DPP set.
if P.HasExtVOP3DPP then
109+
def _dpp : VOP3_DPP_Pseudo<OpName, P> {
110+
let VOP3P = 1;
111+
let PseudoInstr = OpName#"_dpp";
112+
}
113+
}
114+
98115
let isReMaterializable = 1 in {
99116
let isCommutable = 1 in {
100117
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
@@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
160177

161178
// TODO: Make sure we're doing the right thing with denormals. Note
162179
// that FMA and MAD will differ.
163-
multiclass MadFmaMixPats<SDPatternOperator fma_like,
164-
Instruction mix_inst,
165-
Instruction mixlo_inst,
166-
Instruction mixhi_inst,
167-
ValueType VT = f16,
168-
ValueType vecVT = v2f16> {
180+
multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
181+
Instruction mix_inst,
182+
ValueType VT = f16> {
169183
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
170184
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
171185
// At least one of the operands needs to be an fpextend of an f16
@@ -189,7 +203,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
189203
(f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
190204
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
191205
DSTCLAMP.NONE)>;
206+
}
192207

208+
multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
209+
Instruction mixlo_inst,
210+
Instruction mixhi_inst,
211+
ValueType VT = f16,
212+
ValueType vecVT = v2f16> {
213+
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
193214
def : GCNPat <
194215
(AMDGPUclamp (build_vector
195216
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
@@ -243,9 +264,6 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
243264
// FIXME: Special case handling for maxhi (especially for clamp)
244265
// because dealing with the write to high half of the register is
245266
// difficult.
246-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
247-
let True16Predicate = p in {
248-
249267
def : GCNPat <
250268
(build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
251269
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
@@ -269,45 +287,60 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
269287
DSTCLAMP.ENABLE,
270288
VGPR_32:$elt0))
271289
>;
290+
}
272291

273-
} // end True16Predicate
292+
multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like,
293+
Instruction mix_inst_16,
294+
ValueType VT = f16,
295+
ValueType vecVT = v2f16> {
296+
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
297+
def : GCNPat <
298+
(VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
299+
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
300+
(mix_inst_16 $src0_modifiers, $src0,
301+
$src1_modifiers, $src1,
302+
(i32 0), (i32 0),
303+
DSTCLAMP.NONE)
304+
>;
274305

275-
let True16Predicate = UseRealTrue16Insts in {
276306
def : GCNPat <
277-
(build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
307+
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
278308
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
279-
(f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
280-
(vecVT (mixlo_inst $src0_modifiers, $src0,
281-
$src1_modifiers, $src1,
282-
$src2_modifiers, $src2,
283-
DSTCLAMP.NONE,
284-
(REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
309+
(f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
310+
(mix_inst_16 $src0_modifiers, $src0,
311+
$src1_modifiers, $src1,
312+
$src2_modifiers, $src2,
313+
DSTCLAMP.NONE)
285314
>;
286315

316+
287317
def : GCNPat <
288-
(build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
289-
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
290-
(f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
291-
(vecVT (mixhi_inst $src0_modifiers, $src0,
292-
$src1_modifiers, $src1,
293-
$src2_modifiers, $src2,
294-
DSTCLAMP.NONE,
295-
(REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
318+
(AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
319+
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
320+
(f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
321+
(mix_inst_16 $src0_modifiers, $src0,
322+
$src1_modifiers, $src1,
323+
$src2_modifiers, $src2,
324+
DSTCLAMP.ENABLE)
296325
>;
297326

298327
def : GCNPat <
299-
(build_vector
300-
VT:$elt0,
301-
(AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
302-
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
303-
(f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
304-
(vecVT (mixhi_inst $src0_modifiers, $src0,
305-
$src1_modifiers, $src1,
306-
$src2_modifiers, $src2,
307-
DSTCLAMP.ENABLE,
308-
(REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
328+
(AMDGPUclamp (build_vector
329+
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
330+
(f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
331+
(f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
332+
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
333+
(f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
334+
(f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
335+
(vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0,
336+
$lo_src1_modifiers, $lo_src1,
337+
$lo_src2_modifiers, $lo_src2,
338+
DSTCLAMP.ENABLE), lo16,
339+
(mix_inst_16 $hi_src0_modifiers, $hi_src0,
340+
$hi_src1_modifiers, $hi_src1,
341+
$hi_src2_modifiers, $hi_src2,
342+
DSTCLAMP.ENABLE), hi16))
309343
>;
310-
} // end True16Predicate
311344
}
312345

313346
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -341,7 +374,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
341374
} // End FPDPRounding = 1
342375
}
343376

344-
defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
377+
defm : MadFmaMixFP32Pats<fmad, V_MAD_MIX_F32>;
378+
defm : MadFmaMixFP16Pats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
345379
} // OtherPredicates = [NoFP32Denormals]
346380
} // End SubtargetPredicate = HasMadMixInsts
347381

@@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F
360394
let ClampLo = 0, ClampHi = 1 in {
361395
defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
362396
}
397+
398+
// Pseudo true16 inst for v_fma_mixlo/hi_f16
399+
defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
363400
} // End FPDPRounding = 1
364401
}
365402

366-
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
403+
defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32>;
404+
405+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
406+
let True16Predicate = p in
407+
defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
408+
let True16Predicate = UseRealTrue16Insts in
409+
defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_F16_t16>;
367410
}
368411

369412
let SubtargetPredicate = HasFmaMixBF16Insts in {
@@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP
378421
let ClampLo = 0, ClampHi = 1 in {
379422
defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
380423
}
424+
425+
// Pseudo true16 inst for v_fma_mixlo/hi_bf16
426+
defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
381427
} // End FPDPRounding = 1
382428
} // End isCommutable = 1
383429

384-
defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
430+
defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32_BF16, bf16>;
431+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
432+
let True16Predicate = p in
433+
defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
434+
let True16Predicate = UseRealTrue16Insts in
435+
defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
385436
} // End SubtargetPredicate = HasFmaMixBF16Insts
386437

387438
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {

llvm/test/CodeGen/AMDGPU/fdiv.f16.ll

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16(
137137
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
138138
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
139139
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
140-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
140+
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
141141
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
142-
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
142+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc
143143
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
144-
; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
144+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc
145145
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
146-
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
147-
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
148-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
149-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
150-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
151-
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
146+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l
147+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
148+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
149+
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
152150
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
153-
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
154-
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
151+
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0
152+
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
155153
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
156-
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
157-
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
154+
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0
155+
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
158156
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
159-
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
160-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
157+
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0
158+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
161159
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
162-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
163-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
160+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
161+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
164162
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
165-
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
166-
; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
163+
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
164+
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
167165
; GFX11-TRUE16-NEXT: s_endpgm
168166
;
169167
; GFX11-FAKE16-LABEL: v_fdiv_f16:

0 commit comments

Comments
 (0)