Skip to content

Commit 7bc4fce

Browse files
VigneshwarJ and jrbyrnes
authored and committed
[AMDGPU] Fix opsel for scaled MFMA operations
Fix the opsel flag encoding and ASM parsing of scaled MFMA instructions.

Change-Id: Ia21983359d5ca9ebbd55c4536bfed810dc9dd1d7
1 parent 1056c2c commit 7bc4fce

File tree

7 files changed

+186
-77
lines changed

7 files changed

+186
-77
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1851,6 +1851,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
18511851

18521852
void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
18531853
OptionalImmIndexMap &OptionalIdx);
1854+
void cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands);
18541855
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
18551856
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
18561857
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
@@ -6734,17 +6735,25 @@ ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) {
67346735
return ParseStatus::Success;
67356736
}
67366737

6737-
static void addOptionalImmOperand(
6738-
MCInst& Inst, const OperandVector& Operands,
6739-
AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
6740-
AMDGPUOperand::ImmTy ImmT,
6741-
int64_t Default = 0) {
6738+
// Emits the optional immediate operand of kind ImmT into Inst.
//
// OptionalIdx maps an immediate kind to the index of the matching parsed
// operand in Operands. If ImmT is present in the map, that parsed operand
// supplies the value; otherwise an immediate holding Default is used.
// When InsertAt is set, the immediate is inserted at position *InsertAt of
// Inst's operand list; when it is not set, the operand is added through
// addImmOperands / addOperand as before.
static void
6739+
addOptionalImmOperand(MCInst &Inst, const OperandVector &Operands,
6740+
AMDGPUAsmParser::OptionalImmIndexMap &OptionalIdx,
6741+
AMDGPUOperand::ImmTy ImmT, int64_t Default = 0,
6742+
std::optional<unsigned> InsertAt = std::nullopt) {
67426743
auto i = OptionalIdx.find(ImmT);
67436744
if (i != OptionalIdx.end()) {
67446745
unsigned Idx = i->second;
6745-
((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
6746+
const AMDGPUOperand &Op =
6747+
static_cast<const AMDGPUOperand &>(*Operands[Idx]);
// NOTE(review): the insert path builds the immediate straight from
// Op.getImm() and does not go through addImmOperands; confirm that no
// processing done by addImmOperands is needed for operands placed this way.
6748+
if (InsertAt)
6749+
Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Op.getImm()));
6750+
else
6751+
Op.addImmOperands(Inst, 1);
67466752
} else {
6747-
Inst.addOperand(MCOperand::createImm(Default));
// The kind was not written in the assembly: emit Default instead.
// InsertAt.has_value() is the same test as the `if (InsertAt)` above.
6753+
if (InsertAt.has_value())
6754+
Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Default));
6755+
else
6756+
Inst.addOperand(MCOperand::createImm(Default));
67486757
}
67496758
}
67506759

@@ -8677,6 +8686,68 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
86778686
Inst.getOperand(ModIdx).setImm(ModVal);
86788687
}
86798688
}
8689+
// Asm match converter for scaled MFMA instructions: builds Inst's operand
// list from the parsed Operands.
//
// Defs are added first, then the remaining parsed operands in order, with
// immediate modifiers set aside in OptionalIdx. cbsz and blgp are then
// inserted at their named operand indices (0 when not written), two
// modifier immediates are added, and the parsed op_sel / op_sel_hi bits are
// folded into src0_modifiers and src1_modifiers.
void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
8690+
const OperandVector &Operands) {
8691+
OptionalImmIndexMap OptionalIdx;
8692+
unsigned Opc = Inst.getOpcode();
// Parsed operands are consumed starting at index 1; presumably index 0 is
// the mnemonic token - TODO confirm.
8693+
unsigned I = 1;
8694+
8695+
const MCInstrDesc &Desc = MII.get(Opc);
8696+
// Add one register operand per def declared by the instruction description.
8697+
for (unsigned J = 0; J < Desc.getNumDefs(); ++J)
8698+
static_cast<AMDGPUOperand &>(*Operands[I++]).addRegOperands(Inst, 1);
8699+
// Walk the remaining parsed operands. The form used for each one is chosen
// by querying Desc at the current MCInst operand count, i.e. the slot that
// is about to be filled.
8700+
for (unsigned E = Operands.size(); I != E; ++I) {
8701+
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[I]);
8702+
8703+
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
8704+
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
8705+
} else if (Op.isImmModifier()) {
// Immediate modifiers are not emitted here; only their position in
// Operands is recorded, keyed by immediate kind, for use below.
8706+
OptionalIdx[Op.getImmTy()] = I;
8707+
} else {
8708+
Op.addRegOrImmOperands(Inst, 1);
8709+
}
8710+
}
8711+
8712+
// Insert CBSZ and BLGP operands for F8F6F4 variants
// NOTE(review): InsertPos is an int but is passed to a
// std::optional<unsigned> parameter. If getNamedOperandIdx can return a
// negative value for a missing operand name, it would become a very large
// insert position - confirm every opcode using this converter defines both
// cbsz and blgp.
8713+
int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
8714+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ,
8715+
0, InsertPos);
8716+
InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8717+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP,
8718+
0, InsertPos);
8719+
8720+
// Add dummy src_modifiers
// Two zero immediates; presumably these are the src0_modifiers and
// src1_modifiers slots whose values are set in the loop at the end - TODO
// confirm against the instruction's operand order.
8721+
Inst.addOperand(MCOperand::createImm(0));
8722+
Inst.addOperand(MCOperand::createImm(0));
8723+
8724+
// Handle op_sel fields
8725+
8726+
unsigned OpSel = 0;
8727+
auto OpselIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSel);
8728+
if (OpselIdx != OptionalIdx.end())
8729+
OpSel = static_cast<const AMDGPUOperand &>(*Operands[OpselIdx->second])
8730+
.getImm();
8731+
8732+
unsigned OpSelHi = 0;
8733+
auto OpselHiIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSelHi);
8734+
if (OpselHiIdx != OptionalIdx.end())
8735+
OpSelHi = static_cast<const AMDGPUOperand &>(*Operands[OpselHiIdx->second])
8736+
.getImm();
8737+
static const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers,
8738+
AMDGPU::OpName::src1_modifiers};
8739+
// For source J (0 or 1): bit J of op_sel sets OP_SEL_0 and bit J of
// op_sel_hi sets OP_SEL_1 in that source's modifiers operand. Both default
// to 0 when the modifier was not written.
8740+
for (unsigned J = 0; J < 2; ++J) {
8741+
unsigned ModVal = 0;
8742+
if (OpSel & (1 << J))
8743+
ModVal |= SISrcMods::OP_SEL_0;
8744+
if (OpSelHi & (1 << J))
8745+
ModVal |= SISrcMods::OP_SEL_1;
8746+
8747+
const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
8748+
Inst.getOperand(ModIdx).setImm(ModVal);
8749+
}
8750+
}
86808751

86818752
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
86828753
OptionalImmIndexMap &OptionalIdx) {

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -784,12 +784,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
784784
// Currently assumes scaled instructions never have abid
785785
class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
786786
!if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
787-
node:$scale_src0_opsel, node:$scale_src0,
788-
node:$scale_src1_opsel, node:$scale_src1),
787+
node:$src0_modifiers, node:$scale_src0,
788+
node:$src1_modifiers, node:$scale_src1),
789789
!con((ops node:$src0, node:$src1, node:$src2, node:$cbsz),
790790
!if(HasAbid, (ops node:$abid), (ops)),
791791
(ops node:$blgp))),
792-
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1),
792+
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
793793
!if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
794794
(Op $src0, $src1, $src2, $cbsz, $blgp))),
795795
pred
@@ -852,12 +852,12 @@ class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
852852
let InOperandList = !con(BaseInst.InOperandList,
853853
(ins VSrc_b32:$scale_src0,
854854
VSrc_b32:$scale_src1,
855-
op_sel0:$scale_src0_opsel,
856-
op_sel_hi0:$scale_src1_opsel));
855+
op_sel0:$src0_modifiers,
856+
op_sel_hi0:$src1_modifiers));
857857
let AsmOperands =
858858
"$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1"
859-
"$scale_src0_opsel$scale_src1_opsel$cbsz$blgp";
860-
859+
"$src0_modifiers$src1_modifiers$cbsz$blgp";
860+
let AsmMatchConverter = "cvtScaledMFMA";
861861
let FixedSize = 1;
862862
let Size = 16;
863863
}
@@ -1994,7 +1994,6 @@ multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
19941994
defvar PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64");
19951995
defvar Name = PS_ACD.Mnemonic;
19961996
defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";
1997-
19981997
let SubtargetPredicate = HasGFX950Insts,
19991998
DecoderNamespace = "GFX940",
20001999
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
@@ -2010,7 +2009,7 @@ multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
20102009

20112010
multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc<bits<7> op> {
20122011
defm _f8_f8 : VOP3PX_Real_ScaledMFMA<op>;
2013-
2012+
20142013
let isAsmParserOnly = 1 in { // Disable ambiguous disassembly.
20152014
defm _f8_f6 : VOP3PX_Real_ScaledMFMA<op>;
20162015
defm _f6_f8 : VOP3PX_Real_ScaledMFMA<op>;

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -490,14 +490,14 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
490490
bits<9> scale_src0;
491491
bits<9> scale_src1;
492492

493-
bits<2> scale_src0_opsel;
494-
bits<2> scale_src1_opsel;
493+
bits<4> src0_modifiers;
494+
bits<4> src1_modifiers;
495495

496496
// Inst{7-0} = unused
497497
// Inst{10-8} = neg_hi;
498498
// Inst{13-11} = op_sel
499-
let Inst{11} = scale_src0_opsel{0};
500-
let Inst{12} = scale_src1_opsel{0};
499+
let Inst{11} = src0_modifiers{2};
500+
let Inst{12} = src1_modifiers{2};
501501
// Inst{13} = unused op_sel
502502
// Inst{14} = unused op_sel_hi2
503503

@@ -506,8 +506,8 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
506506
let Inst{49-41} = scale_src1;
507507
// Inst{50-58} = unused
508508
// Inst{60-59} = op_sel_hi;
509-
let Inst{59} = scale_src0_opsel{1};
510-
let Inst{60} = scale_src1_opsel{1};
509+
let Inst{59} = src0_modifiers{3};
510+
let Inst{60} = src1_modifiers{3};
511511
// Inst{63-61} = neg;
512512

513513
// The high half of the encoding is the unscaled mfma op.
@@ -1325,17 +1325,17 @@ class getVOP3MAIScaledPat<VOPProfile P, SDPatternOperator node> {
13251325
// mfma
13261326
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
13271327
timm:$cbsz, timm:$blgp,
1328-
MFMALdScaleModifierOp:$scale_src0_opsel,
1328+
MFMALdScaleModifierOp:$src0_modifiers,
13291329
i32:$scale_src0,
1330-
MFMALdScaleModifierOp:$scale_src1_opsel,
1330+
MFMALdScaleModifierOp:$src1_modifiers,
13311331
i32:$scale_src1
13321332
))],
13331333
// smfmac
13341334
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
13351335
timm:$cbsz, timm:$abid,
1336-
MFMALdScaleModifierOp:$scale_src0_opsel,
1336+
MFMALdScaleModifierOp:$src0_modifiers,
13371337
i32:$scale_src0,
1338-
MFMALdScaleModifierOp:$scale_src1_opsel,
1338+
MFMALdScaleModifierOp:$src1_modifiers,
13391339
i32:$scale_src1))]);
13401340
}
13411341

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
4848
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
4949
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
5050
; GCN-NEXT: s_nop 1
51-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
51+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
5252
; GCN-NEXT: s_nop 7
5353
; GCN-NEXT: s_nop 3
5454
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -72,7 +72,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
7272
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
7373
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
7474
; GCN-NEXT: s_nop 1
75-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
75+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
7676
; GCN-NEXT: s_nop 7
7777
; GCN-NEXT: s_nop 3
7878
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -96,7 +96,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
9696
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
9797
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
9898
; GCN-NEXT: s_nop 1
99-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
99+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
100100
; GCN-NEXT: s_nop 7
101101
; GCN-NEXT: s_nop 3
102102
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -120,7 +120,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
120120
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
121121
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
122122
; GCN-NEXT: s_nop 1
123-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
123+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
124124
; GCN-NEXT: s_nop 7
125125
; GCN-NEXT: s_nop 3
126126
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -144,7 +144,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
144144
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
145145
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
146146
; GCN-NEXT: s_nop 1
147-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
147+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
148148
; GCN-NEXT: s_nop 7
149149
; GCN-NEXT: s_nop 3
150150
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -168,7 +168,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
168168
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
169169
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
170170
; GCN-NEXT: s_nop 1
171-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
171+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
172172
; GCN-NEXT: s_nop 7
173173
; GCN-NEXT: s_nop 3
174174
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -192,7 +192,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
192192
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
193193
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
194194
; GCN-NEXT: s_nop 1
195-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
195+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
196196
; GCN-NEXT: s_nop 7
197197
; GCN-NEXT: s_nop 3
198198
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1775,7 +1775,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
17751775
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
17761776
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
17771777
; GCN-NEXT: s_nop 1
1778-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
1778+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0]
17791779
; GCN-NEXT: s_nop 7
17801780
; GCN-NEXT: s_nop 3
17811781
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1797,7 +1797,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
17971797
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
17981798
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
17991799
; SDAG-NEXT: s_nop 1
1800-
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
1800+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0]
18011801
; SDAG-NEXT: s_nop 7
18021802
; SDAG-NEXT: s_nop 3
18031803
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1815,7 +1815,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18151815
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
18161816
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
18171817
; GISEL-NEXT: s_nop 1
1818-
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
1818+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0]
18191819
; GISEL-NEXT: s_nop 7
18201820
; GISEL-NEXT: s_nop 3
18211821
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1838,7 +1838,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18381838
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
18391839
; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
18401840
; SDAG-NEXT: s_nop 1
1841-
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
1841+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0]
18421842
; SDAG-NEXT: s_nop 7
18431843
; SDAG-NEXT: s_nop 3
18441844
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1857,7 +1857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18571857
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
18581858
; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
18591859
; GISEL-NEXT: s_nop 1
1860-
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
1860+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
18611861
; GISEL-NEXT: s_nop 7
18621862
; GISEL-NEXT: s_nop 3
18631863
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1899,7 +1899,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
18991899
; SDAG-NEXT: v_mov_b32_e32 v19, s3
19001900
; SDAG-NEXT: v_mov_b32_e32 v21, s5
19011901
; SDAG-NEXT: s_nop 1
1902+
<<<<<<< HEAD
19021903
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s4, v21 op_sel_hi:[0,0,0] blgp:2
1904+
=======
1905+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
1906+
>>>>>>> f33f71291d5d... [AMDGPU] Fix opsel for scaled MFMA operations
19031907
; SDAG-NEXT: s_nop 7
19041908
; SDAG-NEXT: s_nop 3
19051909
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
@@ -1922,8 +1926,13 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
19221926
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
19231927
; GISEL-NEXT: v_mov_b32_e32 v20, s25
19241928
; GISEL-NEXT: s_nop 1
1929+
<<<<<<< HEAD
19251930
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s24, v20 op_sel_hi:[0,0,0] blgp:2
19261931
; GISEL-NEXT: v_mov_b32_e32 v4, 0
1932+
=======
1933+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
1934+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
1935+
>>>>>>> f33f71291d5d... [AMDGPU] Fix opsel for scaled MFMA operations
19271936
; GISEL-NEXT: s_nop 7
19281937
; GISEL-NEXT: s_nop 2
19291938
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27]
@@ -1964,6 +1973,31 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
19641973
; SDAG-NEXT: s_nop 0
19651974
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s2, -2 op_sel_hi:[0,0,0]
19661975
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1976+
<<<<<<< HEAD
1977+
=======
1978+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
1979+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
1980+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
1981+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
1982+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
1983+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
1984+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
1985+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
1986+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
1987+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
1988+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
1989+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
1990+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
1991+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
1992+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
1993+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
1994+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
1995+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
1996+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
1997+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
1998+
; SDAG-NEXT: s_nop 1
1999+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2000+
>>>>>>> f33f71291d5d... [AMDGPU] Fix opsel for scaled MFMA operations
19672001
; SDAG-NEXT: s_nop 7
19682002
; SDAG-NEXT: s_nop 2
19692003
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1]
@@ -1987,8 +2021,13 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
19872021
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19]
19882022
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
19892023
; GISEL-NEXT: s_nop 1
2024+
<<<<<<< HEAD
19902025
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel_hi:[0,0,0]
19912026
; GISEL-NEXT: v_mov_b32_e32 v4, 0
2027+
=======
2028+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2029+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2030+
>>>>>>> f33f71291d5d... [AMDGPU] Fix opsel for scaled MFMA operations
19922031
; GISEL-NEXT: s_nop 7
19932032
; GISEL-NEXT: s_nop 2
19942033
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]

0 commit comments

Comments
 (0)