From f33f71291d5d26d17fdb518be6a95490fd93542c Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Thu, 15 May 2025 22:35:16 -0500 Subject: [PATCH 1/5] [AMDGPU] Fix opsel for scaled MFMA operations Fix for opsel flags encoding and ASM parsing of the scaled MFMA --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 85 +++++++++++++++++-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 17 ++-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 20 ++--- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 32 +++---- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 46 +++++----- llvm/test/MC/AMDGPU/mai-gfx950.s | 12 +-- .../MC/Disassembler/AMDGPU/gfx950_mai.txt | 12 +-- 7 files changed, 147 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b50a2cf1becf7..450c4fffb8453 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1878,6 +1878,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { void cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); + void cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); @@ -6796,17 +6797,25 @@ ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) { return ParseStatus::Success; } -static void addOptionalImmOperand( - MCInst& Inst, const OperandVector& Operands, - AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, - AMDGPUOperand::ImmTy ImmT, - int64_t Default = 0) { +static void +addOptionalImmOperand(MCInst &Inst, const OperandVector &Operands, + AMDGPUAsmParser::OptionalImmIndexMap &OptionalIdx, + AMDGPUOperand::ImmTy ImmT, int64_t Default = 0, + std::optional InsertAt = std::nullopt) { auto i = OptionalIdx.find(ImmT); if (i != OptionalIdx.end()) { unsigned Idx = i->second; - ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1); + const AMDGPUOperand &Op = + static_cast(*Operands[Idx]); + if (InsertAt) + Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Op.getImm())); + else + Op.addImmOperands(Inst, 1); } else { - Inst.addOperand(MCOperand::createImm(Default)); + if (InsertAt.has_value()) + Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Default)); + else + Inst.addOperand(MCOperand::createImm(Default)); } } @@ -8823,6 +8832,68 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) Inst.getOperand(ModIdx).setImm(ModVal); } } +void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, + const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + unsigned I = 1; + + const MCInstrDesc &Desc = MII.get(Opc); + + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) + static_cast(*Operands[I++]).addRegOperands(Inst, 1); + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = static_cast(*Operands[I]); + + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + Op.addRegOrImmOperands(Inst, 1); + } + } + + // Insert CBSZ and BLGP operands for F8F6F4 variants + int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ, + 0, InsertPos); + InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP, + 0, InsertPos); + + // Add dummy src_modifiers + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createImm(0)); + + // Handle op_sel fields + + unsigned OpSel = 0; + auto OpselIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSel); + if (OpselIdx != OptionalIdx.end()) + OpSel = static_cast(*Operands[OpselIdx->second]) + .getImm(); + + unsigned OpSelHi = 0; + auto OpselHiIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSelHi); + if (OpselHiIdx != OptionalIdx.end()) + OpSelHi = static_cast(*Operands[OpselHiIdx->second]) + .getImm(); + static const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers}; + + for (unsigned J = 0; J < 2; ++J) { + unsigned ModVal = 0; + if (OpSel & (1 << J)) + ModVal |= SISrcMods::OP_SEL_0; + if (OpSelHi & (1 << J)) + ModVal |= SISrcMods::OP_SEL_1; + + const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + Inst.getOperand(ModIdx).setImm(ModVal); + } +} void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index a3be49bf02648..6315d14dbbc92 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -829,12 +829,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper : // Currently assumes scaled instructions never have abid class MAIFrag : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, - node:$scale_src0_opsel, node:$scale_src0, - node:$scale_src1_opsel, node:$scale_src1), + node:$src0_modifiers, node:$scale_src0, + node:$src1_modifiers, node:$scale_src1), !con((ops node:$src0, node:$src1, node:$src2, node:$cbsz), !if(HasAbid, (ops node:$abid), (ops)), (ops node:$blgp))), - !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1), + !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), (Op $src0, $src1, $src2, $cbsz, $blgp))), pred @@ -895,12 +895,12 @@ class ScaledMAIInst : let InOperandList = !con(BaseInst.InOperandList, (ins VSrc_b32:$scale_src0, VSrc_b32:$scale_src1, - op_sel0:$scale_src0_opsel, - op_sel_hi0:$scale_src1_opsel)); + op_sel0:$src0_modifiers, + op_sel_hi0:$src1_modifiers)); let AsmOperands = "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1" - "$scale_src0_opsel$scale_src1_opsel$cbsz$blgp"; - + "$src0_modifiers$src1_modifiers$cbsz$blgp"; + let AsmMatchConverter = "cvtScaledMFMA"; let FixedSize = 1; let Size = 16; } @@ -2041,7 +2041,6 @@ multiclass VOP3PX_Real_ScaledMFMA op> { defvar PS_VCD = !cast(NAME # "_vgprcd" # "_e64"); defvar Name = PS_ACD.Mnemonic; defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8"; - let SubtargetPredicate = HasGFX950Insts, DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { @@ -2057,7 +2056,7 @@ multiclass VOP3PX_Real_ScaledMFMA op> { multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc op> { defm _f8_f8 : VOP3PX_Real_ScaledMFMA; - + let isAsmParserOnly = 1 in { // Disable ambiguous disassembly. defm _f8_f6 : VOP3PX_Real_ScaledMFMA; defm _f6_f8 : VOP3PX_Real_ScaledMFMA; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f9fa83c3f5ae7..1c97c40e26083 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -526,14 +526,14 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ bits<9> scale_src0; bits<9> scale_src1; - bits<2> scale_src0_opsel; - bits<2> scale_src1_opsel; + bits<4> src0_modifiers; + bits<4> src1_modifiers; // Inst{7-0} = unused // Inst{10-8} = neg_hi; // Inst{13-11} = op_sel - let Inst{11} = scale_src0_opsel{0}; - let Inst{12} = scale_src1_opsel{0}; + let Inst{11} = src0_modifiers{2}; + let Inst{12} = src1_modifiers{2}; // Inst{13} = unused op_sel // Inst{14} = unused op_sel_hi2 @@ -542,8 +542,8 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ let Inst{49-41} = scale_src1; // Inst{50-58} = unused // Inst{60-59} = op_sel_hi; - let Inst{59} = scale_src0_opsel{1}; - let Inst{60} = scale_src1_opsel{1}; + let Inst{59} = src0_modifiers{3}; + let Inst{60} = src1_modifiers{3}; // Inst{63-61} = neg; // The high half of the encoding is the unscaled mfma op. @@ -1437,17 +1437,17 @@ class getVOP3MAIScaledPat { // mfma [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, timm:$cbsz, timm:$blgp, - MFMALdScaleModifierOp:$scale_src0_opsel, + MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, - MFMALdScaleModifierOp:$scale_src1_opsel, + MFMALdScaleModifierOp:$src1_modifiers, i32:$scale_src1 ))], // smfmac [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, timm:$cbsz, timm:$abid, - MFMALdScaleModifierOp:$scale_src0_opsel, + MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, - MFMALdScaleModifierOp:$scale_src1_opsel, + MFMALdScaleModifierOp:$src1_modifiers, i32:$scale_src1))]); } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 7cc726a3bd79c..e027dda957a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -46,7 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -70,7 +70,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -94,7 +94,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -118,7 +118,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -142,7 +142,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -166,7 +166,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -190,7 +190,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1797,7 +1797,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1819,7 +1819,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1837,7 +1837,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1860,7 +1860,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1879,7 +1879,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 ; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1921,7 +1921,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 ; SDAG-NEXT: v_mov_b32_e32 v17, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] @@ -1946,7 +1946,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 @@ -1987,7 +1987,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] @@ -2013,7 +2013,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index dac54c9f85e96..5574313f22a47 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -134,7 +134,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -179,7 +179,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -231,7 +231,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -276,7 +276,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -328,7 +328,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -373,7 +373,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -425,7 +425,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -470,7 +470,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -522,7 +522,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -567,7 +567,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -619,7 +619,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -664,7 +664,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -716,7 +716,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -761,7 +761,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -4135,7 +4135,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -4183,7 +4183,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4227,7 +4227,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -4276,7 +4276,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4321,7 +4321,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -4387,7 +4387,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 ; SDAG-NEXT: v_mov_b32_e32 v16, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 @@ -4430,7 +4430,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 ; GISEL-NEXT: v_mov_b32_e32 v16, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 @@ -4486,7 +4486,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 @@ -4529,7 +4529,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index dd090cb73e56d..588d0a8fd1c2f 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -445,23 +445,23 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 @@ -514,7 +514,7 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 blgp:2 -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt index 8adc8b79fbbf5..accd1cc8db03d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -386,7 +386,7 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[1,1,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64] @@ -416,7 +416,7 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24] @@ -452,16 +452,16 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] @@ -530,7 +530,7 @@ # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64 -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[1,1,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64 # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44] From 67a22ccd8d16cbb48df8e8fa5105a6cc80b31af9 Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Tue, 20 May 2025 01:26:15 -0500 Subject: [PATCH 2/5] Added neg and abs modifier for src2 Fixed review comments --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 8 +- .../builtins-amdgcn-error-gfx950-param.cl | 20 +- .../builtins-amdgcn-error-gfx950.cl | 4 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 45 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 14 + .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 14 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 5 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 11 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 21 +- .../UniformityAnalysis/AMDGPU/intrinsics.ll | 12 +- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 268 +++++++++- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 493 ++++++++++++++++-- .../AMDGPU/mai-hazards-mfma-scale.gfx950.mir | 48 +- llvm/test/MC/AMDGPU/mai-gfx950.s | 131 ++++- .../InstCombine/AMDGPU/mfma-scale.ll | 84 ++- llvm/test/Verifier/AMDGPU/mfma-scale.ll | 78 ++- 19 files changed, 1096 insertions(+), 174 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 39fef9e4601f8..467756c5e3aa6 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -438,8 +438,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion- //===----------------------------------------------------------------------===// // GFX950 only builtins. //===----------------------------------------------------------------------===// -TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4fIiIiIiiIii", "nc", "gfx950-insts") -TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIiiIii", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4fIiIiIbIbIiiIii", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIbIbIiiIii", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf16, "V4fV8yV8yV4fIiIiIi", "nc", "gfx950-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index 79083c3c5f0f9..6367989b5209b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -435,18 +435,18 @@ v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) { // CHECK-GFX950-LABEL: @test_mfma_scale_f32_16x16x128_f8f6f4 // CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> -// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b) +// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i1 false, i1 false, i32 2, i32 %scale_a, i32 3, i32 %scale_b) void test_mfma_scale_f32_16x16x128_f8f6f4(global v4f* out, v8i a, v8i b, v4f c, int scale_a, int scale_b) { - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b); + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, false, false, 2, scale_a, 3, scale_b); } // CHECK-GFX950-LABEL: @test_mfma_scale_f32_32x32x64_f8f6f4 // CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> -// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b) +// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i1 false, i1 false, i32 2, i32 %scale_a, i32 3, i32 %scale_b) void test_mfma_scale_f32_32x32x64_f8f6f4(global v16f* out, v8i a, v8i b, v16f c, int scale_a, int scale_b) { - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b); + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, false, false, 2, scale_a, 3, scale_b); } // CHECK-GFX950-LABEL: @test_mfma_i32_16x16x64_i8( diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl index ce4f6ba8f407f..4ab7a5796a24e 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl @@ -38,17 +38,21 @@ void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, fl } void test_mfma_scale_f32_16x16x128_f8f6f4(__global float4* out, int8 a, int8 b, float4 c, int X, int Y) { - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, X, 0, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, X, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, X, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, false, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, false, false, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, false, false, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} } void test_mfma_scale_f32_32x32x64_f8f6f4(__global float16* out, int8 a, int8 b, float16 c, int X, int Y) { - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, X, 0, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, X, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, X, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, false, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, false, false, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, false, false, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} } void test_mfma_i32_16x16x64_i8(__global int4* out, int4 a, int4 b, int4 c, int X) { diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl index b40b1c841b453..6d1f1543fde40 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -50,8 +50,8 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0, *out13 = __builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8' needs target feature gfx950-insts}} *out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8' needs target feature gfx950-insts}} *out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' needs target feature gfx950-insts}} - *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}} - *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}} + *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, false, false, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}} + *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, false, false, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}} *out16 = __builtin_amdgcn_permlane16_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}} *out16 = __builtin_amdgcn_permlane32_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}} *out17 = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out17, a17, b17, c17, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_sr_bf8_bf16' needs target feature bf8-cvt-scale-insts}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a57eb4a6dba49..400d841a1dc33 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3239,9 +3239,8 @@ class AMDGPUMfmaScaleIntrinsic : [llvm_anyvector_ty, llvm_anyvector_ty, DestTy, llvm_i32_ty, // cbsz llvm_i32_ty, // blgp - // llvm_i1_ty, // TODO: neg_src2 - // llvm_i1_ty, // TODO: abs_src2 - // llvm_i1_ty, // TODO: clamp + llvm_i1_ty, // neg_src2 + llvm_i1_ty, // abs_src2 llvm_i32_ty, // op_sel (A matrix scale, 2-bits) // TODO: Make i2? llvm_i32_ty, // v_mfma_ld_scale_b32 src0 (A matrix scale) llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2? @@ -3249,7 +3248,8 @@ class AMDGPUMfmaScaleIntrinsic : ], [IntrConvergent, IntrNoMem, ImmArg>, ImmArg>, - ImmArg>, ImmArg> + ImmArg>, ImmArg>, + ImmArg>, ImmArg> ]>; defset list AMDGPUMFMAIntrinsics908 = { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 734bded69ba03..b7c6dbe3a0976 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4890,8 +4890,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); - OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); + OpdsMapping[12] = getVGPROpMapping(MI.getOperand(12).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 450c4fffb8453..5ff9a699cdf1f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyNegLoSrc2, + ImmTyNegHiSrc2, ImmTyIndexKey8bit, ImmTyIndexKey16bit, ImmTyDPP8, @@ -416,6 +418,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } bool isNegLo() const { return isImmTy(ImmTyNegLo); } bool isNegHi() const { return isImmTy(ImmTyNegHi); } + bool isNegHiSrc2() const { return isImmTy(ImmTyNegHiSrc2); } + bool isNegLoSrc2() const { return isImmTy(ImmTyNegLoSrc2); } bool isBitOp3() const { return isImmTy(ImmTyBitOp3) && isUInt<8>(getImm()); } bool isRegOrImm() const { @@ -1138,6 +1142,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyHigh: OS << "High"; break; case ImmTyBLGP: OS << "BLGP"; break; case ImmTyCBSZ: OS << "CBSZ"; break; + case ImmTyNegLoSrc2: OS << "NegSrc2"; break; + case ImmTyNegHiSrc2: OS << "AbsSrc2"; break; case ImmTyABID: OS << "ABID"; break; case ImmTyEndpgm: OS << "Endpgm"; break; case ImmTyWaitVDST: OS << "WaitVDST"; break; @@ -1632,7 +1638,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseOperandArrayWithPrefix( const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - bool (*ConvertResult)(int64_t &) = nullptr); + std::function ConvertResult = nullptr); ParseStatus parseNamedBit(StringRef Name, OperandVector &Operands, @@ -1687,6 +1693,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseFlatOffset(OperandVector &Operands); ParseStatus parseR128A16(OperandVector &Operands); ParseStatus parseBLGP(OperandVector &Operands); + ParseStatus parseNegHiSrc2(OperandVector &Operands); + ParseStatus parseNegLoSrc2(OperandVector &Operands); bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val); bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc); @@ -6560,7 +6568,7 @@ ParseStatus AMDGPUAsmParser::parseIntWithPrefix( ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy, - bool (*ConvertResult)(int64_t &)) { + std::function ConvertResult) { SMLoc S = getLoc(); if (!trySkipId(Prefix, AsmToken::Colon)) return ParseStatus::NoMatch; @@ -6568,7 +6576,7 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( if (!skipToken(AsmToken::LBrac, "expected a left square bracket")) return ParseStatus::Failure; - unsigned Val = 0; + int64_t Val = 0; const unsigned MaxSize = 4; // FIXME: How to verify the number of elements matches the number of src @@ -6593,7 +6601,9 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( if (!skipToken(AsmToken::Comma, "expected a comma")) return ParseStatus::Failure; } - + if (ConvertResult && !ConvertResult(Val)) { + Error(S, "invalid " + StringRef(Prefix) + " value."); + } Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); return ParseStatus::Success; } @@ -7163,6 +7173,23 @@ ParseStatus AMDGPUAsmParser::parseBLGP(OperandVector &Operands) { return Res; } +static bool RightShift2Bits(int64_t &Neg) { + Neg >>= 2; + return true; +} + +ParseStatus AMDGPUAsmParser::parseNegLoSrc2(OperandVector &Operands) { + return parseOperandArrayWithPrefix( + "neg_lo", Operands, AMDGPUOperand::ImmTyNegLoSrc2, + RightShift2Bits); // Extracting only neg_lo[2] +} + +ParseStatus AMDGPUAsmParser::parseNegHiSrc2(OperandVector &Operands) { + return parseOperandArrayWithPrefix( + "neg_hi", Operands, AMDGPUOperand::ImmTyNegHiSrc2, + RightShift2Bits); // Extracting only neg_hi[2] +} + //===----------------------------------------------------------------------===// // Exp //===----------------------------------------------------------------------===// @@ -8863,6 +8890,12 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP, 0, InsertPos); + // add neg and abs for src2 + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyNegLoSrc2, 0); + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyNegHiSrc2, 0); + // Add dummy src_modifiers Inst.addOperand(MCOperand::createImm(0)); Inst.addOperand(MCOperand::createImm(0)); @@ -8886,9 +8919,9 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, for (unsigned J = 0; J < 2; ++J) { unsigned ModVal = 0; if (OpSel & (1 << J)) - ModVal |= SISrcMods::OP_SEL_0; + ModVal |= SISrcMods::OP_SEL_0; // 3rd bit is from opsel if (OpSelHi & (1 << J)) - ModVal |= SISrcMods::OP_SEL_1; + ModVal |= SISrcMods::OP_SEL_1; // 4th bit is from opsel_hi const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); Inst.getOperand(ModIdx).setImm(ModVal); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 210b0dc18ffc4..50db5bda233d7 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1300,6 +1300,20 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } +void AMDGPUInstPrinter::printNegLoSrc2(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (unsigned NegLo = !!(MI->getOperand(OpNo).getImm())) + O << " neg_lo:[0,0," << NegLo << ']'; +} + +void AMDGPUInstPrinter::printNegHiSrc2(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (unsigned NegHi = !!(MI->getOperand(OpNo).getImm())) + O << " neg_hi:[0,0," << NegHi << ']'; +} + void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 071e0a9d0fee6..78370f6552a81 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -19,9 +19,9 @@ class MCInstrDesc; class AMDGPUInstPrinter : public MCInstPrinter { public: - AMDGPUInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} // Autogenerated by tblgen std::pair @@ -50,7 +50,7 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printSMRDOffset8(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + const MCSubtargetInfo &STI, raw_ostream &O); void printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, @@ -62,7 +62,7 @@ class AMDGPUInstPrinter : public MCInstPrinter { void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); + raw_ostream &O); void printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSymbolicFormat(const MCInst *MI, @@ -126,6 +126,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printNegLoSrc2(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegHiSrc2(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey8bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey16bit(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 3710a54a828ce..f21e0bea4ad05 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -310,7 +310,7 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", class UnscaledMFMAOptimizationPat : PatFrag< (ops node:$srca, node:$srcb, node:$srcc, node:$cbsz, node:$blgp), - (intrin $srca, $srcb, $srcc, $cbsz, $blgp, + (intrin $srca, $srcb, $srcc, $cbsz, $blgp, 0, 0, srcvalue, 0, srcvalue, 0) >; @@ -1244,6 +1244,9 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def neg_lo_src2 : CustomOperand; +def neg_hi_src2 : CustomOperand; + def IndexKey16bit : CustomOperand; def IndexKey8bit : CustomOperand; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 6315d14dbbc92..b2462926d0974 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -829,12 +829,13 @@ class MFMA_F8F6F4_WithSizeTable_Helper : // Currently assumes scaled instructions never have abid class MAIFrag : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, + node:$neg_src2, node:$abs_src2, node:$src0_modifiers, node:$scale_src0, node:$src1_modifiers, node:$scale_src1), !con((ops node:$src0, node:$src1, node:$src2, node:$cbsz), !if(HasAbid, (ops node:$abid), (ops)), (ops node:$blgp))), - !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), + !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $neg_src2, $abs_src2, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), (Op $src0, $src1, $src2, $cbsz, $blgp))), pred @@ -886,8 +887,6 @@ class MAIInst : MAIInst { @@ -895,11 +894,13 @@ class ScaledMAIInst : let InOperandList = !con(BaseInst.InOperandList, (ins VSrc_b32:$scale_src0, VSrc_b32:$scale_src1, + neg_lo_src2:$neg_src2, + neg_hi_src2:$abs_src2, op_sel0:$src0_modifiers, op_sel_hi0:$src1_modifiers)); let AsmOperands = "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1" - "$src0_modifiers$src1_modifiers$cbsz$blgp"; + "$neg_src2$abs_src2$src0_modifiers$src1_modifiers$cbsz$blgp"; let AsmMatchConverter = "cvtScaledMFMA"; let FixedSize = 1; let Size = 16; @@ -2056,7 +2057,7 @@ multiclass VOP3PX_Real_ScaledMFMA op> { multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc op> { defm _f8_f8 : VOP3PX_Real_ScaledMFMA; - + let isAsmParserOnly = 1 in { // Disable ambiguous disassembly. defm _f8_f6 : VOP3PX_Real_ScaledMFMA; defm _f6_f8 : VOP3PX_Real_ScaledMFMA; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 1c97c40e26083..ca9201dcc567f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -526,14 +526,20 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ bits<9> scale_src0; bits<9> scale_src1; + //MFMALdScaleModifierOp transforms 2 bit opsel input to 4 bit value + //where opsel and opselHi are in 3rd and 4th bit. bits<4> src0_modifiers; bits<4> src1_modifiers; + bits<1> neg_src2; + bits<1> abs_src2; // Inst{7-0} = unused // Inst{10-8} = neg_hi; + let Inst{10} = abs_src2; //neg_hi[2] + // Inst{13-11} = op_sel - let Inst{11} = src0_modifiers{2}; - let Inst{12} = src1_modifiers{2}; + let Inst{11} = src0_modifiers{2}; //opsel[0] + let Inst{12} = src1_modifiers{2}; //opsel[1] // Inst{13} = unused op_sel // Inst{14} = unused op_sel_hi2 @@ -542,9 +548,10 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ let Inst{49-41} = scale_src1; // Inst{50-58} = unused // Inst{60-59} = op_sel_hi; - let Inst{59} = src0_modifiers{3}; - let Inst{60} = src1_modifiers{3}; + let Inst{59} = src0_modifiers{3}; //opsel_hi[0] + let Inst{60} = src1_modifiers{3}; //opsel_hi[1] // Inst{63-61} = neg; + let Inst{63} = neg_src2; //neg_lo[2] // The high half of the encoding is the unscaled mfma op. // @@ -1437,6 +1444,8 @@ class getVOP3MAIScaledPat { // mfma [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, timm:$cbsz, timm:$blgp, + i1:$neg_src2, + i1:$abs_src2, MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, MFMALdScaleModifierOp:$src1_modifiers, @@ -1444,7 +1453,9 @@ class getVOP3MAIScaledPat { ))], // smfmac [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, - timm:$cbsz, timm:$abid, + timm:$cbsz, timm:$blgp, + i1:$neg_src2, + i1:$abs_src2, MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, MFMALdScaleModifierOp:$src1_modifiers, diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 1028cc9ebb342..dbd45d6af7688 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -352,22 +352,22 @@ define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloa ret void } -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) -; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) +; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) define amdgpu_kernel void @mfma_scale_f32_16x16x128_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) store <4 x float> %result, ptr addrspace(1) %out ret void } -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) -; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) +; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) define amdgpu_kernel void @mfma_f32_scale_32x32x64_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) store <16 x float> %result, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index e027dda957a6d..d5e22852b25d2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -33,6 +33,87 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +;abs and neg modifier for src2 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_0__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 neg_lo:[0,0,1] op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i1 true, + i1 false, + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_0_neg_hi_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_0_neg_hi_1__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 neg_hi:[0,0,1] op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i1 false, + i1 true, + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_1__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i1 true, + i1 true, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -57,6 +138,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 1, i32 %scale0, i32 1, i32 %scale1) ret <4 x float> %result } @@ -81,6 +164,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 2, i32 %scale0, i32 2, i32 %scale1) ret <4 x float> %result } @@ -105,6 +190,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 3, i32 %scale0, i32 3, i32 %scale1) ret <4 x float> %result } @@ -129,6 +216,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 3, i32 %scale1) ret <4 x float> %result } @@ -153,6 +242,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 3, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -177,6 +268,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 2, i32 %scale0, i32 3, i32 %scale1) ret <4 x float> %result } @@ -201,10 +294,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 3, i32 %scale0, i32 2, i32 %scale1) ret <4 x float> %result } + ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: @@ -226,6 +322,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -251,6 +349,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -276,6 +376,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -301,6 +403,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -326,6 +430,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -351,6 +457,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -376,6 +484,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -401,6 +511,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -426,6 +538,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -451,6 +565,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -476,6 +592,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -501,6 +619,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -527,6 +647,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -552,6 +674,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -576,6 +700,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -601,6 +727,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -626,6 +754,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -651,6 +781,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -676,6 +808,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -701,6 +835,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -726,6 +862,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -751,6 +889,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -776,6 +916,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -800,6 +942,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -824,6 +968,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -848,6 +994,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -872,6 +1020,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -898,6 +1048,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -923,6 +1075,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -948,6 +1102,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -973,6 +1129,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -997,6 +1155,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1021,6 +1181,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1045,6 +1207,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1069,6 +1233,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1093,6 +1259,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1117,6 +1285,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1141,6 +1311,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1165,6 +1337,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1190,6 +1364,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1215,6 +1391,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1240,6 +1418,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1265,6 +1445,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1289,6 +1471,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1313,6 +1497,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1337,6 +1523,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1361,6 +1549,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1385,6 +1575,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1409,6 +1601,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1435,7 +1629,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1456,7 +1650,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1477,7 +1671,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1547,7 +1741,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1601,7 +1795,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1655,7 +1849,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1709,7 +1903,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1730,7 +1924,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1784,7 +1978,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1805,7 +1999,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result } @@ -1845,7 +2039,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 -2) ret <4 x float> %result } @@ -1887,7 +2081,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 77) ret <4 x float> %result } @@ -1952,7 +2146,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 ret void } @@ -2019,7 +2213,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; GISEL-NEXT: s_endpgm - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 ret void } @@ -2042,7 +2236,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -2064,7 +2258,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 3, i32 0, i32 1, i32 0) ret <4 x float> %result } @@ -2085,7 +2279,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } @@ -2106,7 +2300,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result } @@ -2134,6 +2328,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2158,6 +2354,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2181,6 +2379,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2204,6 +2404,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -2228,6 +2430,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2252,6 +2456,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2276,6 +2482,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2300,6 +2508,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2323,6 +2533,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2346,19 +2558,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 5574313f22a47..256903b02692d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -106,6 +106,305 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_0__cbsz0__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 neg_lo:[0,0,1] op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_0__cbsz0__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 neg_lo:[0,0,1] op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i1 true, + i1 false, + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_0_neg_hi_1__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_0_neg_hi_1__cbsz0__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 neg_hi:[0,0,1] op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_0_neg_hi_1__cbsz0__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 neg_hi:[0,0,1] op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i1 false, + i1 true, + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_1__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_1__cbsz0__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_1__cbsz0__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i1 true, + i1 true, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -203,6 +502,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 1, i32 %scale0, i32 1, i32 %scale1) ret <16 x float> %result } @@ -300,6 +601,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 2, i32 %scale0, i32 2, i32 %scale1) ret <16 x float> %result } @@ -397,6 +700,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 3, i32 %scale0, i32 3, i32 %scale1) ret <16 x float> %result } @@ -494,6 +799,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 3, i32 %scale1) ret <16 x float> %result } @@ -591,6 +898,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 3, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -688,6 +997,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 2, i32 %scale0, i32 3, i32 %scale1) ret <16 x float> %result } @@ -785,6 +1096,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 3, i32 %scale0, i32 2, i32 %scale1) ret <16 x float> %result } @@ -836,6 +1149,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -934,6 +1249,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -984,6 +1301,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1036,6 +1355,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1085,6 +1406,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1137,6 +1460,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1186,6 +1511,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1236,6 +1563,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1285,6 +1614,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1383,6 +1714,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1433,6 +1766,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1531,6 +1866,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1582,6 +1919,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1634,6 +1973,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1683,6 +2024,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1735,6 +2078,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1784,6 +2129,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1834,6 +2181,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1883,6 +2232,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1935,6 +2286,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1984,6 +2337,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2036,6 +2391,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2085,6 +2442,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2134,6 +2493,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2182,6 +2543,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2231,6 +2594,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2279,6 +2644,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2332,6 +2699,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2381,6 +2750,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2433,6 +2804,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2482,6 +2855,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2531,6 +2906,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2579,6 +2956,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2628,6 +3007,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2676,6 +3057,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2725,6 +3108,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2773,6 +3158,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2822,6 +3209,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2870,6 +3259,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2920,6 +3311,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2969,6 +3362,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3019,6 +3414,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3068,6 +3465,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3117,6 +3516,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3165,6 +3566,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3214,6 +3617,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3262,6 +3667,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3311,6 +3718,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3359,6 +3768,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3411,7 +3822,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3459,7 +3870,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3507,7 +3918,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3627,7 +4038,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3731,7 +4142,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3835,7 +4246,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3939,7 +4350,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3985,7 +4396,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4109,7 +4520,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4156,7 +4567,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result } @@ -4248,7 +4659,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 -2) ret <16 x float> %result } @@ -4342,7 +4753,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 77) ret <16 x float> %result } @@ -4440,7 +4851,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 ret void } @@ -4539,7 +4950,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 ret void } @@ -4689,7 +5100,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -4836,7 +5247,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 25, i32 0, i32 42) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -4983,7 +5394,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -5130,7 +5541,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 25, i32 0, i32 42) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -5179,7 +5590,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -5226,7 +5637,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 3, i32 0, i32 1, i32 0) ret <16 x float> %result } @@ -5273,7 +5684,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result } @@ -5320,7 +5731,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 1, i32 0, i32 0) ret <16 x float> %result } @@ -5421,6 +5832,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5518,6 +5931,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5613,6 +6028,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5662,6 +6079,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -5759,6 +6178,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5856,6 +6277,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5907,6 +6330,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5958,6 +6383,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6053,6 +6480,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6102,19 +6531,21 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index 4585eca8fe894..bb64cb106b5e3 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -162,14 +162,14 @@ body: | ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -185,13 +185,13 @@ body: | ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -207,14 +207,14 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -230,13 +230,13 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -252,13 +252,13 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 3 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -274,12 +274,12 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index 588d0a8fd1c2f..7bf197706f9a0 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -375,7 +375,6 @@ v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 //===----------------------------------------------------------------------===// // v_mfma_scale_f32_16x16x128_f8f6f4 //===----------------------------------------------------------------------===// -// FIXME: Test op_sel, neg, clamp // GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -465,10 +464,74 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 +// op_sel combinations + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x18,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,0,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 + + +// neg combinations + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x80,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_hi:[0,0,1] + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0xa1,0x12,0x01,0x80,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] neg_hi:[0,0,1] + //===----------------------------------------------------------------------===// // v_mfma_scale_f32_32x32x64_f8f6f4 //===----------------------------------------------------------------------===// -// FIXME: Test op_sel, neg, clamp // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -518,6 +581,70 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 +// op_sel combinations + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x18,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x08,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,1,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x08,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x08,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,1,0] + +// neg combinations + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x80,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_hi:[0,0,1] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0x30,0x63,0x02,0x80,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] neg_hi:[0,0,1] + //===----------------------------------------------------------------------===// // v_mfma_f32_16x16x128_f8f6f4 with appropriate register widths //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll index 709f143a4745a..988c831990dbc 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll @@ -9,12 +9,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -23,12 +25,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 2, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 2, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -38,12 +42,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -53,12 +59,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -67,12 +75,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -81,12 +91,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -95,12 +107,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <6 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG1]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -109,12 +123,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-SAME: <6 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG0]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -124,12 +140,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -139,12 +157,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -153,12 +173,14 @@ define <4 x float> @test_flags_shrink_src0(<8 x i32> %arg0, <8 x i32> %arg1, <4 ; CHECK-LABEL: define <4 x float> @test_flags_shrink_src0( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call nnan nsz <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call nnan nsz <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call nnan nsz <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -171,12 +193,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -185,12 +209,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 2, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 2, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -200,12 +226,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -215,12 +243,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -229,12 +259,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -243,12 +275,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -257,12 +291,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <6 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG1]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -271,12 +307,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-SAME: <6 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG0]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -286,12 +324,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -301,12 +341,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } diff --git a/llvm/test/Verifier/AMDGPU/mfma-scale.ll b/llvm/test/Verifier/AMDGPU/mfma-scale.ll index 1e3e8856df3d1..a1bc73284aa3b 100644 --- a/llvm/test/Verifier/AMDGPU/mfma-scale.ll +++ b/llvm/test/Verifier/AMDGPU/mfma-scale.ll @@ -5,45 +5,53 @@ ; -------------------------------------------------------------------- ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i64> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v4i64_fp8__v8i32_fp8(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i64> %arg1 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8v4i64_fp8(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK: <4 x i64> %arg0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v4i64_fp8__v8i32_fp8(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i64> %arg1 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8v4i64_fp8(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -53,45 +61,53 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8v4i64_fp8(<8 ; -------------------------------------------------------------------- ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v5i32.v8i32(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v5i32.v8i32(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <5 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v5i32_fp4__v8i32_fp4(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v5i32(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v5i32(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <5 x i32> %arg1 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v5i32_fp4(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v5i32(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v7i32.v8i32(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v7i32.v8i32(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <7 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v7i32_fp4__v8i32_fp4(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v7i32(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v7i32(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <7 x i32> %arg1 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v7i32_fp4(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v7i32(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -101,56 +117,66 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v7i32_fp4( ; -------------------------------------------------------------------- ; CHECK: invalid value for cbsz format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 9999, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 9999, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 9999 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_invalid0__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 9999, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for blgp format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 9999, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 9999, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 9999 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_invalid0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 9999, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for cbsz format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 5, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 5, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 5 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_invalid1__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 5, ; cbsz i32 2, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for blgp format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 5, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 5, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 5 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i321_invalid(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 5, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for cbsz format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 5, i32 5, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 5, i32 5, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 5 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_invalid__v8i32_invalid(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 5, ; cbsz i32 5, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -160,71 +186,83 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_invalid__v8i32_ ; -------------------------------------------------------------------- ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg0 ; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v4i32_fp8__v8i32_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg1 -; CHECK-NEXT: i32 0 +; CHECK-NEXT: i1 false define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4__v8i32_fp8___v4i32_fp8(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v4i32_fp8__v4i32_fp8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <6 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp8__v6i32_fp8(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg0 ; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v4i32_fp8__v4i32_fp8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <6 x i32> %arg0 ; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp8__v6i32_fp8(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp + i1 false, + i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } From e73a9cb96b504557216c3edae4d0cbf324dc8be1 Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Tue, 20 May 2025 01:34:31 -0500 Subject: [PATCH 3/5] minor change --- .../lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 78370f6552a81..48c9656ebbbc3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -19,9 +19,9 @@ class MCInstrDesc; class AMDGPUInstPrinter : public MCInstPrinter { public: - AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + AMDGPUInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} // Autogenerated by tblgen std::pair @@ -50,7 +50,7 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printSMRDOffset8(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + const MCSubtargetInfo &STI, raw_ostream &O); void printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, @@ -62,7 +62,7 @@ class AMDGPUInstPrinter : public MCInstPrinter { void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); + raw_ostream &O); void printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSymbolicFormat(const MCInst *MI, From 87f8800767a60c9a77a3b5b857767dee934decc1 Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Tue, 20 May 2025 12:45:12 -0500 Subject: [PATCH 4/5] Revert "Added neg and abs modifier for src2" This reverts commit 67a22ccd8d16cbb48df8e8fa5105a6cc80b31af9. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 8 +- .../builtins-amdgcn-error-gfx950-param.cl | 20 +- .../builtins-amdgcn-error-gfx950.cl | 4 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 45 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 14 - .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 4 - llvm/lib/Target/AMDGPU/SIInstrInfo.td | 5 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 9 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 11 +- .../UniformityAnalysis/AMDGPU/intrinsics.ll | 12 +- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 268 +--------- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 493 ++---------------- .../AMDGPU/mai-hazards-mfma-scale.gfx950.mir | 48 +- llvm/test/MC/AMDGPU/mai-gfx950.s | 29 +- .../InstCombine/AMDGPU/mfma-scale.ll | 84 +-- llvm/test/Verifier/AMDGPU/mfma-scale.ll | 78 +-- 19 files changed, 164 insertions(+), 982 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 51fcbd65db94c..730fd15913c11 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -439,8 +439,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion- //===----------------------------------------------------------------------===// // GFX950 only builtins. //===----------------------------------------------------------------------===// -TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4fIiIiIbIbIiiIii", "nc", "gfx950-insts") -TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIbIbIiiIii", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4fIiIiIiiIii", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIiiIii", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf16, "V4fV8yV8yV4fIiIiIi", "nc", "gfx950-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index 6367989b5209b..79083c3c5f0f9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -435,18 +435,18 @@ v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) { // CHECK-GFX950-LABEL: @test_mfma_scale_f32_16x16x128_f8f6f4 // CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> -// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i1 false, i1 false, i32 2, i32 %scale_a, i32 3, i32 %scale_b) +// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b) void test_mfma_scale_f32_16x16x128_f8f6f4(global v4f* out, v8i a, v8i b, v4f c, int scale_a, int scale_b) { - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, false, false, 2, scale_a, 3, scale_b); + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b); } // CHECK-GFX950-LABEL: @test_mfma_scale_f32_32x32x64_f8f6f4 // CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> -// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i1 false, i1 false, i32 2, i32 %scale_a, i32 3, i32 %scale_b) +// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b) void test_mfma_scale_f32_32x32x64_f8f6f4(global v16f* out, v8i a, v8i b, v16f c, int scale_a, int scale_b) { - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, false, false, 2, scale_a, 3, scale_b); + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b); } // CHECK-GFX950-LABEL: @test_mfma_i32_16x16x64_i8( diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl index 4ab7a5796a24e..ce4f6ba8f407f 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl @@ -38,21 +38,17 @@ void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, fl } void test_mfma_scale_f32_16x16x128_f8f6f4(__global float4* out, int8 a, int8 b, float4 c, int X, int Y) { - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, X, 0, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, X, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, X, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, false, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, false, false, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, false, false, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} } void test_mfma_scale_f32_32x32x64_f8f6f4(__global float16* out, int8 a, int8 b, float16 c, int X, int Y) { - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, X, 0, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, X, false, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, X, false, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, false, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, false, false, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} - *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, false, false, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} } void test_mfma_i32_16x16x64_i8(__global int4* out, int4 a, int4 b, int4 c, int X) { diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl index 6d1f1543fde40..b40b1c841b453 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -50,8 +50,8 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0, *out13 = __builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8' needs target feature gfx950-insts}} *out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8' needs target feature gfx950-insts}} *out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' needs target feature gfx950-insts}} - *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, false, false, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}} - *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, false, false, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}} + *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}} + *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}} *out16 = __builtin_amdgcn_permlane16_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}} *out16 = __builtin_amdgcn_permlane32_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}} *out17 = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out17, a17, b17, c17, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_sr_bf8_bf16' needs target feature bf8-cvt-scale-insts}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 74204909a9b63..0e5d48ee8f567 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3261,8 +3261,9 @@ class AMDGPUMfmaScaleIntrinsic : [llvm_anyvector_ty, llvm_anyvector_ty, DestTy, llvm_i32_ty, // cbsz llvm_i32_ty, // blgp - llvm_i1_ty, // neg_src2 - llvm_i1_ty, // abs_src2 + // llvm_i1_ty, // TODO: neg_src2 + // llvm_i1_ty, // TODO: abs_src2 + // llvm_i1_ty, // TODO: clamp llvm_i32_ty, // op_sel (A matrix scale, 2-bits) // TODO: Make i2? llvm_i32_ty, // v_mfma_ld_scale_b32 src0 (A matrix scale) llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2? @@ -3270,8 +3271,7 @@ class AMDGPUMfmaScaleIntrinsic : ], [IntrConvergent, IntrNoMem, ImmArg>, ImmArg>, - ImmArg>, ImmArg>, - ImmArg>, ImmArg> + ImmArg>, ImmArg> ]>; defset list AMDGPUMFMAIntrinsics908 = { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 6ae0bd1ebda79..dd7aef8f0c583 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4891,8 +4891,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); - OpdsMapping[12] = getVGPROpMapping(MI.getOperand(12).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 37f79b279edc4..ec05073866f61 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,8 +152,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, - ImmTyNegLoSrc2, - ImmTyNegHiSrc2, ImmTyIndexKey8bit, ImmTyIndexKey16bit, ImmTyDPP8, @@ -418,8 +416,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } bool isNegLo() const { return isImmTy(ImmTyNegLo); } bool isNegHi() const { return isImmTy(ImmTyNegHi); } - bool isNegHiSrc2() const { return isImmTy(ImmTyNegHiSrc2); } - bool isNegLoSrc2() const { return isImmTy(ImmTyNegLoSrc2); } bool isBitOp3() const { return isImmTy(ImmTyBitOp3) && isUInt<8>(getImm()); } bool isRegOrImm() const { @@ -1142,8 +1138,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyHigh: OS << "High"; break; case ImmTyBLGP: OS << "BLGP"; break; case ImmTyCBSZ: OS << "CBSZ"; break; - case ImmTyNegLoSrc2: OS << "NegSrc2"; break; - case ImmTyNegHiSrc2: OS << "AbsSrc2"; break; case ImmTyABID: OS << "ABID"; break; case ImmTyEndpgm: OS << "Endpgm"; break; case ImmTyWaitVDST: OS << "WaitVDST"; break; @@ -1638,7 +1632,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseOperandArrayWithPrefix( const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - std::function ConvertResult = nullptr); + bool (*ConvertResult)(int64_t &) = nullptr); ParseStatus parseNamedBit(StringRef Name, OperandVector &Operands, @@ -1693,8 +1687,6 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseFlatOffset(OperandVector &Operands); ParseStatus parseR128A16(OperandVector &Operands); ParseStatus parseBLGP(OperandVector &Operands); - ParseStatus parseNegHiSrc2(OperandVector &Operands); - ParseStatus parseNegLoSrc2(OperandVector &Operands); bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val); bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc); @@ -6556,7 +6548,7 @@ ParseStatus AMDGPUAsmParser::parseIntWithPrefix( ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy, - std::function ConvertResult) { + bool (*ConvertResult)(int64_t &)) { SMLoc S = getLoc(); if (!trySkipId(Prefix, AsmToken::Colon)) return ParseStatus::NoMatch; @@ -6564,7 +6556,7 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( if (!skipToken(AsmToken::LBrac, "expected a left square bracket")) return ParseStatus::Failure; - int64_t Val = 0; + unsigned Val = 0; const unsigned MaxSize = 4; // FIXME: How to verify the number of elements matches the number of src @@ -6589,9 +6581,7 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( if (!skipToken(AsmToken::Comma, "expected a comma")) return ParseStatus::Failure; } - if (ConvertResult && !ConvertResult(Val)) { - Error(S, "invalid " + StringRef(Prefix) + " value."); - } + Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); return ParseStatus::Success; } @@ -7161,23 +7151,6 @@ ParseStatus AMDGPUAsmParser::parseBLGP(OperandVector &Operands) { return Res; } -static bool RightShift2Bits(int64_t &Neg) { - Neg >>= 2; - return true; -} - -ParseStatus AMDGPUAsmParser::parseNegLoSrc2(OperandVector &Operands) { - return parseOperandArrayWithPrefix( - "neg_lo", Operands, AMDGPUOperand::ImmTyNegLoSrc2, - RightShift2Bits); // Extracting only neg_lo[2] -} - -ParseStatus AMDGPUAsmParser::parseNegHiSrc2(OperandVector &Operands) { - return parseOperandArrayWithPrefix( - "neg_hi", Operands, AMDGPUOperand::ImmTyNegHiSrc2, - RightShift2Bits); // Extracting only neg_hi[2] -} - //===----------------------------------------------------------------------===// // Exp //===----------------------------------------------------------------------===// @@ -8878,12 +8851,6 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP, 0, InsertPos); - // add neg and abs for src2 - addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTyNegLoSrc2, 0); - addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTyNegHiSrc2, 0); - // Add dummy src_modifiers Inst.addOperand(MCOperand::createImm(0)); Inst.addOperand(MCOperand::createImm(0)); @@ -8907,9 +8874,9 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, for (unsigned J = 0; J < 2; ++J) { unsigned ModVal = 0; if (OpSel & (1 << J)) - ModVal |= SISrcMods::OP_SEL_0; // 3rd bit is from opsel + ModVal |= SISrcMods::OP_SEL_0; if (OpSelHi & (1 << J)) - ModVal |= SISrcMods::OP_SEL_1; // 4th bit is from opsel_hi + ModVal |= SISrcMods::OP_SEL_1; const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); Inst.getOperand(ModIdx).setImm(ModVal); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 251825762a03a..a56bca514aff3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1297,20 +1297,6 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } -void AMDGPUInstPrinter::printNegLoSrc2(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (unsigned NegLo = !!(MI->getOperand(OpNo).getImm())) - O << " neg_lo:[0,0," << NegLo << ']'; -} - -void AMDGPUInstPrinter::printNegHiSrc2(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (unsigned NegHi = !!(MI->getOperand(OpNo).getImm())) - O << " neg_hi:[0,0," << NegHi << ']'; -} - void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 48c9656ebbbc3..071e0a9d0fee6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -126,10 +126,6 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printNegLoSrc2(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printNegHiSrc2(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey8bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey16bit(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index ac1f658f576b5..84a6aeacc226a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -310,7 +310,7 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", class UnscaledMFMAOptimizationPat : PatFrag< (ops node:$srca, node:$srcb, node:$srcc, node:$cbsz, node:$blgp), - (intrin $srca, $srcb, $srcc, $cbsz, $blgp, 0, 0, + (intrin $srca, $srcb, $srcc, $cbsz, $blgp, srcvalue, 0, srcvalue, 0) >; @@ -1244,9 +1244,6 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; -def neg_lo_src2 : CustomOperand; -def neg_hi_src2 : CustomOperand; - def IndexKey16bit : CustomOperand; def IndexKey8bit : CustomOperand; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index b2462926d0974..06ee41acf41ac 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -829,13 +829,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper : // Currently assumes scaled instructions never have abid class MAIFrag : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, - node:$neg_src2, node:$abs_src2, node:$src0_modifiers, node:$scale_src0, node:$src1_modifiers, node:$scale_src1), !con((ops node:$src0, node:$src1, node:$src2, node:$cbsz), !if(HasAbid, (ops node:$abid), (ops)), (ops node:$blgp))), - !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $neg_src2, $abs_src2, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), + !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), (Op $src0, $src1, $src2, $cbsz, $blgp))), pred @@ -887,6 +886,8 @@ class MAIInst : MAIInst { @@ -894,13 +895,11 @@ class ScaledMAIInst : let InOperandList = !con(BaseInst.InOperandList, (ins VSrc_b32:$scale_src0, VSrc_b32:$scale_src1, - neg_lo_src2:$neg_src2, - neg_hi_src2:$abs_src2, op_sel0:$src0_modifiers, op_sel_hi0:$src1_modifiers)); let AsmOperands = "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1" - "$neg_src2$abs_src2$src0_modifiers$src1_modifiers$cbsz$blgp"; + "$src0_modifiers$src1_modifiers$cbsz$blgp"; let AsmMatchConverter = "cvtScaledMFMA"; let FixedSize = 1; let Size = 16; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8df0654476eaa..952ee2fe2c955 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -531,12 +531,8 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ bits<4> src0_modifiers; bits<4> src1_modifiers; - bits<1> neg_src2; - bits<1> abs_src2; // Inst{7-0} = unused // Inst{10-8} = neg_hi; - let Inst{10} = abs_src2; //neg_hi[2] - // Inst{13-11} = op_sel let Inst{11} = src0_modifiers{2}; //opsel[0] let Inst{12} = src1_modifiers{2}; //opsel[1] @@ -551,7 +547,6 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ let Inst{59} = src0_modifiers{3}; //opsel_hi[0] let Inst{60} = src1_modifiers{3}; //opsel_hi[1] // Inst{63-61} = neg; - let Inst{63} = neg_src2; //neg_lo[2] // The high half of the encoding is the unscaled mfma op. // @@ -1444,8 +1439,6 @@ class getVOP3MAIScaledPat { // mfma [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, timm:$cbsz, timm:$blgp, - i1:$neg_src2, - i1:$abs_src2, MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, MFMALdScaleModifierOp:$src1_modifiers, @@ -1453,9 +1446,7 @@ class getVOP3MAIScaledPat { ))], // smfmac [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, - timm:$cbsz, timm:$blgp, - i1:$neg_src2, - i1:$abs_src2, + timm:$cbsz, timm:$abid, MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, MFMALdScaleModifierOp:$src1_modifiers, diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index dbd45d6af7688..1028cc9ebb342 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -352,22 +352,22 @@ define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloa ret void } -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) -; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) +; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) define amdgpu_kernel void @mfma_scale_f32_16x16x128_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) store <4 x float> %result, ptr addrspace(1) %out ret void } -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) -; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) +; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) define amdgpu_kernel void @mfma_f32_scale_32x32x64_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i1 false, i1 false, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) store <16 x float> %result, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index d5e22852b25d2..e027dda957a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -33,87 +33,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -;abs and neg modifier for src2 -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_0__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 neg_lo:[0,0,1] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i1 true, - i1 false, - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_0_neg_hi_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_0_neg_hi_1__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 neg_hi:[0,0,1] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i1 false, - i1 true, - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_neg_1_neg_hi_1__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i1 true, - i1 true, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -138,8 +57,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 1, i32 %scale0, i32 1, i32 %scale1) ret <4 x float> %result } @@ -164,8 +81,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 2, i32 %scale0, i32 2, i32 %scale1) ret <4 x float> %result } @@ -190,8 +105,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 3, i32 %scale0, i32 3, i32 %scale1) ret <4 x float> %result } @@ -216,8 +129,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 3, i32 %scale1) ret <4 x float> %result } @@ -242,8 +153,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 3, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -268,8 +177,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 2, i32 %scale0, i32 3, i32 %scale1) ret <4 x float> %result } @@ -294,13 +201,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 3, i32 %scale0, i32 2, i32 %scale1) ret <4 x float> %result } - ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: @@ -322,8 +226,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -349,8 +251,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -376,8 +276,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -403,8 +301,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -430,8 +326,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -457,8 +351,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -484,8 +376,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -511,8 +401,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -538,8 +426,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -565,8 +451,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -592,8 +476,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -619,8 +501,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -647,8 +527,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -674,8 +552,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -700,8 +576,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -727,8 +601,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -754,8 +626,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -781,8 +651,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -808,8 +676,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -835,8 +701,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -862,8 +726,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -889,8 +751,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -916,8 +776,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -942,8 +800,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -968,8 +824,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -994,8 +848,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1020,8 +872,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1048,8 +898,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1075,8 +923,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1102,8 +948,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1129,8 +973,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1155,8 +997,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1181,8 +1021,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1207,8 +1045,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1233,8 +1069,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1259,8 +1093,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1285,8 +1117,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1311,8 +1141,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1337,8 +1165,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1364,8 +1190,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1391,8 +1215,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1418,8 +1240,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1445,8 +1265,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1471,8 +1289,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1497,8 +1313,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1523,8 +1337,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1549,8 +1361,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1575,8 +1385,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1601,8 +1409,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1629,7 +1435,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1650,7 +1456,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1671,7 +1477,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1741,7 +1547,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1795,7 +1601,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1849,7 +1655,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1903,7 +1709,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1924,7 +1730,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1978,7 +1784,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1999,7 +1805,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 33, i32 2, i32 -2) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result } @@ -2039,7 +1845,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 -2) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) ret <4 x float> %result } @@ -2081,7 +1887,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 77) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) ret <4 x float> %result } @@ -2146,7 +1952,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 3, i32 %scale0, i32 1, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 ret void } @@ -2213,7 +2019,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; GISEL-NEXT: s_endpgm - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 3, i32 65, i32 1, i32 -2) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 ret void } @@ -2236,7 +2042,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -2258,7 +2064,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 3, i32 0, i32 1, i32 0) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <4 x float> %result } @@ -2279,7 +2085,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } @@ -2300,7 +2106,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 1, i32 0, i32 0) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result } @@ -2328,8 +2134,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2354,8 +2158,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2379,8 +2181,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2404,8 +2204,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -2430,8 +2228,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2456,8 +2252,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2482,8 +2276,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2508,8 +2300,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2533,8 +2323,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -2558,21 +2346,19 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 -declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 256903b02692d..5574313f22a47 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -106,305 +106,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_0__cbsz0__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 neg_lo:[0,0,1] op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_0__cbsz0__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 neg_lo:[0,0,1] op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i1 true, - i1 false, - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_0_neg_hi_1__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_0_neg_hi_1__cbsz0__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 neg_hi:[0,0,1] op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_0_neg_hi_1__cbsz0__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 neg_hi:[0,0,1] op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i1 false, - i1 true, - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_1__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_1__cbsz0__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_neg_1_neg_hi_1__cbsz0__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i1 true, - i1 true, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -502,8 +203,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 1, i32 %scale0, i32 1, i32 %scale1) ret <16 x float> %result } @@ -601,8 +300,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 2, i32 %scale0, i32 2, i32 %scale1) ret <16 x float> %result } @@ -700,8 +397,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 3, i32 %scale0, i32 3, i32 %scale1) ret <16 x float> %result } @@ -799,8 +494,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 3, i32 %scale1) ret <16 x float> %result } @@ -898,8 +591,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 3, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -997,8 +688,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 2, i32 %scale0, i32 3, i32 %scale1) ret <16 x float> %result } @@ -1096,8 +785,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 3, i32 %scale0, i32 2, i32 %scale1) ret <16 x float> %result } @@ -1149,8 +836,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1249,8 +934,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1301,8 +984,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1355,8 +1036,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1406,8 +1085,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1460,8 +1137,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1511,8 +1186,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1563,8 +1236,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1614,8 +1285,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1714,8 +1383,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1766,8 +1433,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1866,8 +1531,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -1919,8 +1582,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -1973,8 +1634,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2024,8 +1683,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2078,8 +1735,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2129,8 +1784,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2181,8 +1834,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2232,8 +1883,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2286,8 +1935,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2337,8 +1984,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2391,8 +2036,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2442,8 +2085,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2493,8 +2134,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2543,8 +2182,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2594,8 +2231,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2644,8 +2279,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2699,8 +2332,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2750,8 +2381,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2804,8 +2433,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2855,8 +2482,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -2906,8 +2531,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -2956,8 +2579,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3007,8 +2628,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3057,8 +2676,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3108,8 +2725,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3158,8 +2773,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3209,8 +2822,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3259,8 +2870,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3311,8 +2920,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3362,8 +2969,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3414,8 +3019,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3465,8 +3068,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3516,8 +3117,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3566,8 +3165,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3617,8 +3214,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3667,8 +3262,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3718,8 +3311,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3768,8 +3359,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -3822,7 +3411,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3870,7 +3459,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3918,7 +3507,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4038,7 +3627,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4142,7 +3731,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4246,7 +3835,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4350,7 +3939,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4396,7 +3985,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4520,7 +4109,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4567,7 +4156,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 33, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result } @@ -4659,7 +4248,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) ret <16 x float> %result } @@ -4753,7 +4342,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 2, i32 65, i32 2, i32 77) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) ret <16 x float> %result } @@ -4851,7 +4440,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 3, i32 %scale0, i32 1, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 ret void } @@ -4950,7 +4539,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 3, i32 65, i32 1, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 ret void } @@ -5100,7 +4689,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -5247,7 +4836,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 25, i32 0, i32 42) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -5394,7 +4983,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -5541,7 +5130,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 25, i32 0, i32 42) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 store volatile <16 x float> %result, ptr addrspace(1) null, align 64 ret void @@ -5590,7 +5179,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -5637,7 +5226,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 3, i32 0, i32 1, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <16 x float> %result } @@ -5684,7 +5273,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 0, i32 0, i32 1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result } @@ -5731,7 +5320,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 1, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <16 x float> %result } @@ -5832,8 +5421,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -5931,8 +5518,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6028,8 +5613,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6079,8 +5662,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -6178,8 +5759,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6277,8 +5856,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6330,8 +5907,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6383,8 +5958,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6480,8 +6053,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -6531,21 +6102,19 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 -declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i1 immarg, i1 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index bb64cb106b5e3..4585eca8fe894 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -162,14 +162,14 @@ body: | ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -185,13 +185,13 @@ body: | ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -207,14 +207,14 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -230,13 +230,13 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -252,13 +252,13 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 3 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -274,12 +274,12 @@ body: | ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 0, 0, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index 7bf197706f9a0..cf8ca70c07a3f 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -375,6 +375,7 @@ v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 //===----------------------------------------------------------------------===// // v_mfma_scale_f32_16x16x128_f8f6f4 //===----------------------------------------------------------------------===// +// FIXME: Test neg, clamp // GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -515,23 +516,10 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 o v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 -// neg combinations - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x80,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_hi:[0,0,1] - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0xa1,0x12,0x01,0x80,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 neg_lo:[0,0,1] neg_hi:[0,0,1] - //===----------------------------------------------------------------------===// // v_mfma_scale_f32_32x32x64_f8f6f4 //===----------------------------------------------------------------------===// +// FIXME: Test neg, clamp // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -631,19 +619,6 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,1,0] -// neg combinations - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x80,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_hi:[0,0,1] - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] neg_hi:[0,0,1] op_sel_hi:[0,0,0] ; encoding: [0x00,0x04,0xac,0xd3,0x30,0x63,0x02,0x80,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 neg_lo:[0,0,1] neg_hi:[0,0,1] //===----------------------------------------------------------------------===// // v_mfma_f32_16x16x128_f8f6f4 with appropriate register widths diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll index 988c831990dbc..709f143a4745a 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/mfma-scale.ll @@ -9,14 +9,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -25,14 +23,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 2, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 2, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -42,14 +38,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -59,14 +53,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -75,14 +67,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -91,14 +81,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -107,14 +95,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <6 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG1]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -123,14 +109,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-SAME: <6 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG0]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <4 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -140,14 +124,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -157,14 +139,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -173,14 +153,12 @@ define <4 x float> @test_flags_shrink_src0(<8 x i32> %arg0, <8 x i32> %arg1, <4 ; CHECK-LABEL: define <4 x float> @test_flags_shrink_src0( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <4 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call nnan nsz <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call nnan nsz <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <4 x float> [[ARG2]], i32 0, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <4 x float> [[RESULT]] ; %result = call nnan nsz <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -193,14 +171,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> [[ARG0]], <6 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -209,14 +185,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 2, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 2, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -226,14 +200,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -243,14 +215,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <6 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> [[TMP1]], <6 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 2, i32 2, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -259,14 +229,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -275,14 +243,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -291,14 +257,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <6 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG1]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> [[ARG0]], <4 x i32> [[TMP1]], <16 x float> [[ARG2]], i32 0, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -307,14 +271,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-LABEL: define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; CHECK-SAME: <6 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i32> [[ARG0]], <6 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> [[TMP1]], <8 x i32> [[ARG1]], <16 x float> [[ARG2]], i32 4, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -324,14 +286,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]], i32 [[SCALE0:%.*]], i32 [[SCALE1:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 [[SCALE0]], i32 0, i32 [[SCALE1]]) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -341,14 +301,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; CHECK-SAME: <8 x i32> [[ARG0:%.*]], <8 x i32> [[ARG1:%.*]], <16 x float> [[ARG2:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[ARG0]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[ARG1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i1 false, i1 false, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <16 x float> [[ARG2]], i32 4, i32 4, i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: ret <16 x float> [[RESULT]] ; %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } diff --git a/llvm/test/Verifier/AMDGPU/mfma-scale.ll b/llvm/test/Verifier/AMDGPU/mfma-scale.ll index a1bc73284aa3b..1e3e8856df3d1 100644 --- a/llvm/test/Verifier/AMDGPU/mfma-scale.ll +++ b/llvm/test/Verifier/AMDGPU/mfma-scale.ll @@ -5,53 +5,45 @@ ; -------------------------------------------------------------------- ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i64> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v4i64_fp8__v8i32_fp8(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i64> %arg1 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8v4i64_fp8(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK: <4 x i64> %arg0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v4i64_fp8__v8i32_fp8(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i64.v8i32(<4 x i64> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 0, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i64> %arg1 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8v4i64_fp8(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i64(<8 x i32> %arg0, <4 x i64> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -61,53 +53,45 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8v4i64_fp8(<8 ; -------------------------------------------------------------------- ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v5i32.v8i32(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v5i32.v8i32(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <5 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v5i32_fp4__v8i32_fp4(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<5 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v5i32(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v5i32(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <5 x i32> %arg1 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v5i32_fp4(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v5i32(<8 x i32> %arg0, <5 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 0 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v7i32.v8i32(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v7i32.v8i32(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <7 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v7i32_fp4__v8i32_fp4(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i64.v8i32(<7 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: operand 1 must be 4, 6 or 8 element i32 vector -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v7i32(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v7i32(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 4, i32 4, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <7 x i32> %arg1 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v7i32_fp4(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v7i32(<8 x i32> %arg0, <7 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -117,66 +101,56 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v7i32_fp4( ; -------------------------------------------------------------------- ; CHECK: invalid value for cbsz format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 9999, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 9999, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 9999 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_invalid0__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 9999, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for blgp format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 9999, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 9999, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 9999 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_invalid0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 9999, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for cbsz format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 5, i32 2, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 5, i32 2, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 5 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_invalid1__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 5, ; cbsz i32 2, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for blgp format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 5, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 5, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 5 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i321_invalid(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 5, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid value for cbsz format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 5, i32 5, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 5, i32 5, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: i32 5 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_invalid__v8i32_invalid(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 5, ; cbsz i32 5, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -186,83 +160,71 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_invalid__v8i32_ ; -------------------------------------------------------------------- ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg0 ; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v4i32_fp8__v8i32_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg1 -; CHECK-NEXT: i1 false +; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4__v8i32_fp8___v4i32_fp8(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v4i32_fp8__v4i32_fp8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <6 x i32> %arg0 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp8__v6i32_fp8(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <4 x i32> %arg0 ; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v4i32_fp8__v4i32_fp8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } ; CHECK: invalid vector type for format -; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i1 false, i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) +; CHECK-NEXT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ; CHECK-NEXT: <6 x i32> %arg0 ; CHECK-NEXT: i32 0 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp8__v6i32_fp8(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp - i1 false, - i1 false, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } From 8e30e0a449c051b00496b02c60be66674bc1db65 Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Wed, 21 May 2025 11:42:57 -0500 Subject: [PATCH 5/5] review changes --- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ec05073866f61..dc5bb00a15274 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8859,17 +8859,19 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, unsigned OpSel = 0; auto OpselIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSel); - if (OpselIdx != OptionalIdx.end()) + if (OpselIdx != OptionalIdx.end()) { OpSel = static_cast(*Operands[OpselIdx->second]) .getImm(); + } unsigned OpSelHi = 0; auto OpselHiIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSelHi); - if (OpselHiIdx != OptionalIdx.end()) + if (OpselHiIdx != OptionalIdx.end()) { OpSelHi = static_cast(*Operands[OpselHiIdx->second]) .getImm(); - static const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers}; + } + const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers}; for (unsigned J = 0; J < 2; ++J) { unsigned ModVal = 0;