-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] Fix opsel for scaled MFMA operations #140183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Fix for opsel flags encoding and ASM parsing of the scaled MFMA
|
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu Author: Vigneshwar Jayakumar (VigneshwarJ) ChangesFix for opsel flags encoding and ASM parsing of the scaled MFMA Patch is 40.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140183.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b50a2cf1becf7..450c4fffb8453 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1878,6 +1878,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
+ void cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
@@ -6796,17 +6797,25 @@ ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) {
return ParseStatus::Success;
}
-static void addOptionalImmOperand(
- MCInst& Inst, const OperandVector& Operands,
- AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
- AMDGPUOperand::ImmTy ImmT,
- int64_t Default = 0) {
+static void
+addOptionalImmOperand(MCInst &Inst, const OperandVector &Operands,
+ AMDGPUAsmParser::OptionalImmIndexMap &OptionalIdx,
+ AMDGPUOperand::ImmTy ImmT, int64_t Default = 0,
+ std::optional<unsigned> InsertAt = std::nullopt) {
auto i = OptionalIdx.find(ImmT);
if (i != OptionalIdx.end()) {
unsigned Idx = i->second;
- ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
+ const AMDGPUOperand &Op =
+ static_cast<const AMDGPUOperand &>(*Operands[Idx]);
+ if (InsertAt)
+ Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Op.getImm()));
+ else
+ Op.addImmOperands(Inst, 1);
} else {
- Inst.addOperand(MCOperand::createImm(Default));
+ if (InsertAt.has_value())
+ Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Default));
+ else
+ Inst.addOperand(MCOperand::createImm(Default));
}
}
@@ -8823,6 +8832,68 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
Inst.getOperand(ModIdx).setImm(ModVal);
}
}
+void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
+ const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ unsigned Opc = Inst.getOpcode();
+ unsigned I = 1;
+
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J)
+ static_cast<AMDGPUOperand &>(*Operands[I++]).addRegOperands(Inst, 1);
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[I]);
+
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ Op.addRegOrImmOperands(Inst, 1);
+ }
+ }
+
+ // Insert CBSZ and BLGP operands for F8F6F4 variants
+ int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ,
+ 0, InsertPos);
+ InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP,
+ 0, InsertPos);
+
+ // Add dummy src_modifiers
+ Inst.addOperand(MCOperand::createImm(0));
+ Inst.addOperand(MCOperand::createImm(0));
+
+ // Handle op_sel fields
+
+ unsigned OpSel = 0;
+ auto OpselIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSel);
+ if (OpselIdx != OptionalIdx.end())
+ OpSel = static_cast<const AMDGPUOperand &>(*Operands[OpselIdx->second])
+ .getImm();
+
+ unsigned OpSelHi = 0;
+ auto OpselHiIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSelHi);
+ if (OpselHiIdx != OptionalIdx.end())
+ OpSelHi = static_cast<const AMDGPUOperand &>(*Operands[OpselHiIdx->second])
+ .getImm();
+ static const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers};
+
+ for (unsigned J = 0; J < 2; ++J) {
+ unsigned ModVal = 0;
+ if (OpSel & (1 << J))
+ ModVal |= SISrcMods::OP_SEL_0;
+ if (OpSelHi & (1 << J))
+ ModVal |= SISrcMods::OP_SEL_1;
+
+ const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ Inst.getOperand(ModIdx).setImm(ModVal);
+ }
+}
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a3be49bf02648..6315d14dbbc92 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -829,12 +829,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
// Currently assumes scaled instructions never have abid
class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
!if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
- node:$scale_src0_opsel, node:$scale_src0,
- node:$scale_src1_opsel, node:$scale_src1),
+ node:$src0_modifiers, node:$scale_src0,
+ node:$src1_modifiers, node:$scale_src1),
!con((ops node:$src0, node:$src1, node:$src2, node:$cbsz),
!if(HasAbid, (ops node:$abid), (ops)),
(ops node:$blgp))),
- !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1),
+ !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
!if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
(Op $src0, $src1, $src2, $cbsz, $blgp))),
pred
@@ -895,12 +895,12 @@ class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
let InOperandList = !con(BaseInst.InOperandList,
(ins VSrc_b32:$scale_src0,
VSrc_b32:$scale_src1,
- op_sel0:$scale_src0_opsel,
- op_sel_hi0:$scale_src1_opsel));
+ op_sel0:$src0_modifiers,
+ op_sel_hi0:$src1_modifiers));
let AsmOperands =
"$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1"
- "$scale_src0_opsel$scale_src1_opsel$cbsz$blgp";
-
+ "$src0_modifiers$src1_modifiers$cbsz$blgp";
+ let AsmMatchConverter = "cvtScaledMFMA";
let FixedSize = 1;
let Size = 16;
}
@@ -2041,7 +2041,6 @@ multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
defvar PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64");
defvar Name = PS_ACD.Mnemonic;
defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";
-
let SubtargetPredicate = HasGFX950Insts,
DecoderNamespace = "GFX940",
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
@@ -2057,7 +2056,7 @@ multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc<bits<7> op> {
defm _f8_f8 : VOP3PX_Real_ScaledMFMA<op>;
-
+
let isAsmParserOnly = 1 in { // Disable ambiguous disassembly.
defm _f8_f6 : VOP3PX_Real_ScaledMFMA<op>;
defm _f6_f8 : VOP3PX_Real_ScaledMFMA<op>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f9fa83c3f5ae7..1c97c40e26083 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -526,14 +526,14 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
bits<9> scale_src0;
bits<9> scale_src1;
- bits<2> scale_src0_opsel;
- bits<2> scale_src1_opsel;
+ bits<4> src0_modifiers;
+ bits<4> src1_modifiers;
// Inst{7-0} = unused
// Inst{10-8} = neg_hi;
// Inst{13-11} = op_sel
- let Inst{11} = scale_src0_opsel{0};
- let Inst{12} = scale_src1_opsel{0};
+ let Inst{11} = src0_modifiers{2};
+ let Inst{12} = src1_modifiers{2};
// Inst{13} = unused op_sel
// Inst{14} = unused op_sel_hi2
@@ -542,8 +542,8 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
let Inst{49-41} = scale_src1;
// Inst{50-58} = unused
// Inst{60-59} = op_sel_hi;
- let Inst{59} = scale_src0_opsel{1};
- let Inst{60} = scale_src1_opsel{1};
+ let Inst{59} = src0_modifiers{3};
+ let Inst{60} = src1_modifiers{3};
// Inst{63-61} = neg;
// The high half of the encoding is the unscaled mfma op.
@@ -1437,17 +1437,17 @@ class getVOP3MAIScaledPat<VOPProfile P, SDPatternOperator node> {
// mfma
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
timm:$cbsz, timm:$blgp,
- MFMALdScaleModifierOp:$scale_src0_opsel,
+ MFMALdScaleModifierOp:$src0_modifiers,
i32:$scale_src0,
- MFMALdScaleModifierOp:$scale_src1_opsel,
+ MFMALdScaleModifierOp:$src1_modifiers,
i32:$scale_src1
))],
// smfmac
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
timm:$cbsz, timm:$abid,
- MFMALdScaleModifierOp:$scale_src0_opsel,
+ MFMALdScaleModifierOp:$src0_modifiers,
i32:$scale_src0,
- MFMALdScaleModifierOp:$scale_src1_opsel,
+ MFMALdScaleModifierOp:$src1_modifiers,
i32:$scale_src1))]);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 7cc726a3bd79c..e027dda957a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -46,7 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -70,7 +70,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -94,7 +94,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -118,7 +118,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -142,7 +142,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -166,7 +166,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -190,7 +190,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1797,7 +1797,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1819,7 +1819,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1837,7 +1837,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1860,7 +1860,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1879,7 +1879,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1921,7 +1921,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
@@ -1946,7 +1946,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
; GISEL-NEXT: v_mov_b32_e32 v16, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
@@ -1987,7 +1987,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
@@ -2013,7 +2013,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index dac54c9f85e96..5574313f22a47 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -134,7 +134,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -179,7 +179,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: ...
[truncated]
|
| v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 | ||
|
|
||
| // GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] | ||
| // GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A side question: SP3 says this should be:
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1] op_sel_hi:[0,0] cbsz:3 abid:1 blgp:1
Why do we not emit abid:1 here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
abid is set to 1 default for scaled mfma, we are not just printing it.
| multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc<bits<7> op> { | ||
| defm _f8_f8 : VOP3PX_Real_ScaledMFMA<op>; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Introduced trailing whitespace
| bits<4> src0_modifiers; | ||
| bits<4> src1_modifiers; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs comment explaining these. If you're writing custom operand handling maybe we shouldn't rename these? It is a weird case
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought so too, but this matches the current src_modifiers field and uses an existing bit mask 4 bit value (OpselHI, OpselLo, Abs and Neg) in TableGen. There is also already a logic in place for asm printing using this bit mask. Also, in previous revision, it was a 2-bit field, which was reading the lower 2 bits hence not generating the right encoding for opsel bits. Though this MFMA instruction doesn't use neg and abs, I thought it would be cleaner to match the existing implementation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it does support neg, it's just not wired up
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh I see it, I see this in doc " NEG[1:0] and ABS[1:0] must be zero. NEG[2] and ABS[2] may be used to control matrix C" . So I will wire up a src2_modifier with neg/abs bits mapped and add some tests for it as well
| v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 blgp:2 | ||
|
|
||
| // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] | ||
| // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't seem like enough lines changed. Do we have every permutation of values tested?
Fixed review comments
This reverts commit 67a22cc.
| if (OpselIdx != OptionalIdx.end()) | ||
| OpSel = static_cast<const AMDGPUOperand &>(*Operands[OpselIdx->second]) | ||
| .getImm(); | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (OpselIdx != OptionalIdx.end()) | |
| OpSel = static_cast<const AMDGPUOperand &>(*Operands[OpselIdx->second]) | |
| .getImm(); | |
| if (OpselIdx != OptionalIdx.end()) { | |
| OpSel = static_cast<const AMDGPUOperand &>(*Operands[OpselIdx->second]) | |
| .getImm(); | |
| } | |
| if (OpselHiIdx != OptionalIdx.end()) | ||
| OpSelHi = static_cast<const AMDGPUOperand &>(*Operands[OpselHiIdx->second]) | ||
| .getImm(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (OpselHiIdx != OptionalIdx.end()) | |
| OpSelHi = static_cast<const AMDGPUOperand &>(*Operands[OpselHiIdx->second]) | |
| .getImm(); | |
| if (OpselHiIdx != OptionalIdx.end()) { | |
| OpSelHi = static_cast<const AMDGPUOperand &>(*Operands[OpselHiIdx->second]) | |
| .getImm(); | |
| } |
| static const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, | ||
| AMDGPU::OpName::src1_modifiers}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| static const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, | |
| AMDGPU::OpName::src1_modifiers}; | |
| const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, | |
| AMDGPU::OpName::src1_modifiers}; |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/16552 Here is the relevant piece of the build log for the reference |
Pulls in llvm/llvm-project#139956 to fix an regression on AMD backend. Also to pick up llvm/llvm-project#140183.
Pulls in llvm/llvm-project#139956 to fix an regression on AMD backend. Also to pick up llvm/llvm-project#140183.
Fix for src scale opsel flags encoding and ASM parsing for gfx950 scaled MFMA.
… (llvm#2319) Fix for src scale opsel flags encoding and ASM parsing for gfx950 scaled MFMA.
… (llvm#2466) Fix for src scale opsel flags encoding and ASM parsing for gfx950 scaled MFMA. (cherry picked from commit e12cbd8) Co-authored-by: Vigneshwar Jayakumar <[email protected]>
Pulls in llvm/llvm-project#139956 to fix an regression on AMD backend. Also to pick up llvm/llvm-project#140183.
Fix for opsel flags encoding and ASM parsing of the scaled MFMA