From b941f831f8fe737a589bb7f443895de8ec94f3d4 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 25 Mar 2025 15:36:55 -0700 Subject: [PATCH 1/3] [RegisterCoalescer]: Try inflated RC for coalescing reg->subreg Change-Id: Id1123431e9fdcabac2811f5b18abff7d66e2d1f3 --- .../llvm/CodeGen/MachineRegisterInfo.h | 9 +- llvm/lib/CodeGen/MachineRegisterInfo.cpp | 16 +- llvm/lib/CodeGen/RegisterCoalescer.cpp | 9 +- .../coalesce-copy-to-agpr-to-av-registers.mir | 137 +- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 300 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 3266 ++++++++++------- .../AMDGPU/mfma-no-register-aliasing.ll | 706 ++-- 7 files changed, 2559 insertions(+), 1884 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 8e288cf212360..5f868efde8d9c 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -731,7 +731,14 @@ class MachineRegisterInfo { /// recomputeRegClass - Try to find a legal super-class of Reg's register /// class that still satisfies the constraints from the instructions using - /// Reg. Returns true if Reg was upgraded. + /// \p Reg. \p return the super-class TargetRegisterClass if one was found, + /// otherwise \p return the original TargetRegisterClass. + const TargetRegisterClass * + getLargestConstrainedSuperClass(Register Reg) const; + + /// recomputeRegClass - Try to find a legal super-class of Reg's register + /// class that still satisfies the constraints from the instructions using + /// \p Reg. \p return true if Reg was upgraded. /// /// This method can be used after constraints have been removed from a /// virtual register, for example after removing instructions or splitting diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 937f63f6c5e00..d483625212c55 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -118,8 +118,8 @@ MachineRegisterInfo::constrainRegAttrs(Register Reg, return true; } -bool -MachineRegisterInfo::recomputeRegClass(Register Reg) { +const TargetRegisterClass * +MachineRegisterInfo::getLargestConstrainedSuperClass(Register Reg) const { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); const TargetRegisterClass *OldRC = getRegClass(Reg); const TargetRegisterInfo *TRI = getTargetRegisterInfo(); @@ -127,7 +127,7 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) { // Stop early if there is no room to grow. if (NewRC == OldRC) - return false; + return NewRC; // Accumulate constraints from all uses. for (MachineOperand &MO : reg_nodbg_operands(Reg)) { @@ -136,8 +136,16 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) { unsigned OpNo = &MO - &MI->getOperand(0); NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, TRI); if (!NewRC || NewRC == OldRC) - return false; + return OldRC; } + return NewRC; +} + +bool MachineRegisterInfo::recomputeRegClass(Register Reg) { + const TargetRegisterClass *OldRC = getRegClass(Reg); + const TargetRegisterClass *NewRC = getLargestConstrainedSuperClass(Reg); + if (NewRC == OldRC) + return false; setRegClass(Reg, NewRC); return true; } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index c27435aa2dae0..15cb68012d0c9 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -477,7 +477,9 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { Flipped = true; } - const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + const MachineFunction *MF = MI->getMF(); + + const MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); if (Dst.isPhysical()) { @@ -515,6 +517,11 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { // SrcReg will be merged with a sub-register of DstReg. SrcIdx = DstSub; NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); + if (!NewRC) { + auto SuperDstRC = MRI.getLargestConstrainedSuperClass(Dst); + if (SuperDstRC != DstRC) + NewRC = TRI.getMatchingSuperRegClass(SuperDstRC, SrcRC, DstSub); + } } else if (SrcSub) { // DstReg will be merged with a sub-register of SrcReg. DstIdx = SrcSub; diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index f153b30c80b22..3f1965107b361 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -1440,11 +1440,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64 = COPY $vgpr1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -1465,11 +1463,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64_align2 = COPY $vgpr1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -1490,13 +1486,10 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96 = COPY $vgpr2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -1519,13 +1512,10 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96_align2 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96_align2 = COPY $vgpr2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -1548,11 +1538,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_128 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:av_128 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 @@ -1573,11 +1561,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_128_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:av_128_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 @@ -1623,11 +1609,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:av_96 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 @@ -1648,11 +1632,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:av_96_align2 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 @@ -1673,11 +1655,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_96 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96 = COPY $vgpr2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 @@ -1698,11 +1678,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_96_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96_align2 = COPY $vgpr2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 @@ -1723,10 +1701,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64 = COPY [[COPY]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 @@ -1746,11 +1723,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64_align2 = COPY $vgpr1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -1771,10 +1746,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96 = COPY [[COPY]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 @@ -1794,10 +1768,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 @@ -1817,12 +1790,11 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_128 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_128 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_128 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:av_128 = COPY [[COPY]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 @@ -1844,12 +1816,11 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_128_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_128_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_128_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:av_128_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index dac54c9f85e96..1ab7de7cda935 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -3515,42 +3515,56 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s0 -; SDAG-NEXT: v_mov_b32_e32 v25, s1 -; SDAG-NEXT: v_mov_b32_e32 v26, s2 -; SDAG-NEXT: v_mov_b32_e32 v27, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s28 -; SDAG-NEXT: v_mov_b32_e32 v33, s29 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v32 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v33 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_mov_b32_e32 v34, s2 +; SDAG-NEXT: v_mov_b32_e32 v35, s3 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v31, v13 +; SDAG-NEXT: v_mov_b32_e32 v30, v12 +; SDAG-NEXT: v_mov_b32_e32 v29, v11 +; SDAG-NEXT: v_mov_b32_e32 v28, v10 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3579,34 +3593,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v32, s28 -; GISEL-NEXT: v_mov_b32_e32 v33, s29 -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v33 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3843,6 +3871,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v26, s0 ; SDAG-NEXT: v_mov_b32_e32 v27, s1 ; SDAG-NEXT: v_mov_b32_e32 v28, s2 @@ -3851,7 +3880,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3898,10 +3926,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3993,42 +4021,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 -; SDAG-NEXT: v_mov_b32_e32 v18, s2 -; SDAG-NEXT: v_mov_b32_e32 v19, s3 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 -; SDAG-NEXT: v_mov_b32_e32 v32, s28 -; SDAG-NEXT: v_mov_b32_e32 v33, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v31 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v32 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v33 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_mov_b32_e32 v34, s2 +; SDAG-NEXT: v_mov_b32_e32 v35, s3 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v31, v13 +; SDAG-NEXT: v_mov_b32_e32 v30, v12 +; SDAG-NEXT: v_mov_b32_e32 v29, v11 +; SDAG-NEXT: v_mov_b32_e32 v28, v10 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v24, s28 +; SDAG-NEXT: v_mov_b32_e32 v25, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4057,38 +4091,44 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v24, s20 -; GISEL-NEXT: v_mov_b32_e32 v25, s21 -; GISEL-NEXT: v_mov_b32_e32 v26, s22 -; GISEL-NEXT: v_mov_b32_e32 v27, s23 -; GISEL-NEXT: v_mov_b32_e32 v28, s24 -; GISEL-NEXT: v_mov_b32_e32 v29, s25 -; GISEL-NEXT: v_mov_b32_e32 v30, s26 -; GISEL-NEXT: v_mov_b32_e32 v31, s27 -; GISEL-NEXT: v_mov_b32_e32 v32, s28 -; GISEL-NEXT: v_mov_b32_e32 v33, s29 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v31 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v32 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v33 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v16, s20 +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v17, s21 +; GISEL-NEXT: v_mov_b32_e32 v18, s22 +; GISEL-NEXT: v_mov_b32_e32 v19, s23 +; GISEL-NEXT: v_mov_b32_e32 v20, s24 +; GISEL-NEXT: v_mov_b32_e32 v21, s25 +; GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GISEL-NEXT: v_mov_b32_e32 v23, s27 +; GISEL-NEXT: v_mov_b32_e32 v24, s28 +; GISEL-NEXT: v_mov_b32_e32 v25, s29 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 77d4aad5f3174..08167cd226a4c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -72,46 +72,94 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x64_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x64_f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x64_f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -252,25 +300,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: @@ -288,22 +353,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -315,25 +372,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: @@ -351,22 +425,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -378,25 +444,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: @@ -414,22 +497,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -441,94 +516,108 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s0 +; SDAG-NEXT: v_mov_b32_e32 v29, s1 +; SDAG-NEXT: v_mov_b32_e32 v30, s2 +; SDAG-NEXT: v_mov_b32_e32 v31, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, s24 +; GISEL-NEXT: v_mov_b32_e32 v19, s25 +; GISEL-NEXT: v_mov_b32_e32 v24, v0 +; GISEL-NEXT: v_mov_b32_e32 v25, v1 +; GISEL-NEXT: v_mov_b32_e32 v26, v2 +; GISEL-NEXT: v_mov_b32_e32 v27, v3 +; GISEL-NEXT: v_mov_b32_e32 v28, v4 +; GISEL-NEXT: v_mov_b32_e32 v29, v5 +; GISEL-NEXT: v_mov_b32_e32 v30, v6 +; GISEL-NEXT: v_mov_b32_e32 v31, v7 +; GISEL-NEXT: v_mov_b32_e32 v32, v8 +; GISEL-NEXT: v_mov_b32_e32 v33, v9 +; GISEL-NEXT: v_mov_b32_e32 v16, v10 +; GISEL-NEXT: v_mov_b32_e32 v20, s26 +; GISEL-NEXT: v_mov_b32_e32 v21, s27 +; GISEL-NEXT: v_mov_b32_e32 v22, s28 +; GISEL-NEXT: v_mov_b32_e32 v23, s29 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -579,12 +668,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -594,12 +688,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result @@ -609,12 +708,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result @@ -705,25 +809,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -733,25 +854,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result @@ -761,25 +899,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result @@ -789,54 +944,70 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, s0 -; GCN-NEXT: v_mov_b32_e32 v37, s1 -; GCN-NEXT: v_mov_b32_e32 v38, s2 -; GCN-NEXT: v_mov_b32_e32 v39, s3 +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: v_mov_b32_e32 v30, s2 +; GCN-NEXT: v_mov_b32_e32 v31, s3 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_mov_b32_e32 v27, v9 +; GCN-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NEXT: v_mov_b32_e32 v25, v7 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NEXT: v_mov_b32_e32 v22, v4 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 v20, v2 +; GCN-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NEXT: v_mov_b32_e32 v18, v0 ; GCN-NEXT: v_mov_b32_e32 v13, s25 ; GCN-NEXT: v_mov_b32_e32 v14, s26 ; GCN-NEXT: v_mov_b32_e32 v15, s27 ; GCN-NEXT: v_mov_b32_e32 v16, s28 ; GCN-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NEXT: v_mov_b32_e32 v28, s16 -; GCN-NEXT: v_mov_b32_e32 v29, s17 -; GCN-NEXT: v_mov_b32_e32 v30, s18 -; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -916,46 +1087,94 @@ bb: } define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_i32_16x16x128_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_i32_16x16x128_i8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_i32_16x16x128_i8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result } define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x i32> %result } define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x i32> %result } @@ -1102,25 +1321,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: @@ -1138,22 +1374,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1165,25 +1393,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: @@ -1201,22 +1446,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1228,25 +1465,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: @@ -1264,22 +1518,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1291,94 +1537,108 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s0 +; SDAG-NEXT: v_mov_b32_e32 v29, s1 +; SDAG-NEXT: v_mov_b32_e32 v30, s2 +; SDAG-NEXT: v_mov_b32_e32 v31, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, s24 +; GISEL-NEXT: v_mov_b32_e32 v19, s25 +; GISEL-NEXT: v_mov_b32_e32 v24, v0 +; GISEL-NEXT: v_mov_b32_e32 v25, v1 +; GISEL-NEXT: v_mov_b32_e32 v26, v2 +; GISEL-NEXT: v_mov_b32_e32 v27, v3 +; GISEL-NEXT: v_mov_b32_e32 v28, v4 +; GISEL-NEXT: v_mov_b32_e32 v29, v5 +; GISEL-NEXT: v_mov_b32_e32 v30, v6 +; GISEL-NEXT: v_mov_b32_e32 v31, v7 +; GISEL-NEXT: v_mov_b32_e32 v32, v8 +; GISEL-NEXT: v_mov_b32_e32 v33, v9 +; GISEL-NEXT: v_mov_b32_e32 v16, v10 +; GISEL-NEXT: v_mov_b32_e32 v20, s26 +; GISEL-NEXT: v_mov_b32_e32 v21, s27 +; GISEL-NEXT: v_mov_b32_e32 v22, s28 +; GISEL-NEXT: v_mov_b32_e32 v23, s29 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result @@ -1458,46 +1718,94 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1627,46 +1935,94 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1796,46 +2152,94 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1965,46 +2369,94 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2151,25 +2603,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: @@ -2186,23 +2655,15 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v34, v8 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2214,25 +2675,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: @@ -2250,22 +2728,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2277,25 +2747,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: @@ -2313,22 +2800,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2340,94 +2819,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s0 +; SDAG-NEXT: v_mov_b32_e32 v29, s1 +; SDAG-NEXT: v_mov_b32_e32 v30, s2 +; SDAG-NEXT: v_mov_b32_e32 v31, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, s24 +; GISEL-NEXT: v_mov_b32_e32 v19, s25 +; GISEL-NEXT: v_mov_b32_e32 v24, v0 +; GISEL-NEXT: v_mov_b32_e32 v25, v1 +; GISEL-NEXT: v_mov_b32_e32 v26, v2 +; GISEL-NEXT: v_mov_b32_e32 v27, v3 +; GISEL-NEXT: v_mov_b32_e32 v28, v4 +; GISEL-NEXT: v_mov_b32_e32 v29, v5 +; GISEL-NEXT: v_mov_b32_e32 v30, v6 +; GISEL-NEXT: v_mov_b32_e32 v31, v7 +; GISEL-NEXT: v_mov_b32_e32 v32, v8 +; GISEL-NEXT: v_mov_b32_e32 v33, v9 +; GISEL-NEXT: v_mov_b32_e32 v16, v10 +; GISEL-NEXT: v_mov_b32_e32 v20, s26 +; GISEL-NEXT: v_mov_b32_e32 v21, s27 +; GISEL-NEXT: v_mov_b32_e32 v22, s28 +; GISEL-NEXT: v_mov_b32_e32 v23, s29 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2524,25 +3017,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: @@ -2560,22 +3070,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2587,25 +3089,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: @@ -2623,22 +3142,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2650,25 +3161,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: @@ -2686,22 +3214,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2713,94 +3233,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s0 +; SDAG-NEXT: v_mov_b32_e32 v29, s1 +; SDAG-NEXT: v_mov_b32_e32 v30, s2 +; SDAG-NEXT: v_mov_b32_e32 v31, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, s24 +; GISEL-NEXT: v_mov_b32_e32 v19, s25 +; GISEL-NEXT: v_mov_b32_e32 v24, v0 +; GISEL-NEXT: v_mov_b32_e32 v25, v1 +; GISEL-NEXT: v_mov_b32_e32 v26, v2 +; GISEL-NEXT: v_mov_b32_e32 v27, v3 +; GISEL-NEXT: v_mov_b32_e32 v28, v4 +; GISEL-NEXT: v_mov_b32_e32 v29, v5 +; GISEL-NEXT: v_mov_b32_e32 v30, v6 +; GISEL-NEXT: v_mov_b32_e32 v31, v7 +; GISEL-NEXT: v_mov_b32_e32 v32, v8 +; GISEL-NEXT: v_mov_b32_e32 v33, v9 +; GISEL-NEXT: v_mov_b32_e32 v16, v10 +; GISEL-NEXT: v_mov_b32_e32 v20, s26 +; GISEL-NEXT: v_mov_b32_e32 v21, s27 +; GISEL-NEXT: v_mov_b32_e32 v22, s28 +; GISEL-NEXT: v_mov_b32_e32 v23, s29 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2897,25 +3431,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: @@ -2933,22 +3484,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2960,25 +3503,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: @@ -2996,22 +3556,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3023,25 +3575,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: @@ -3059,22 +3628,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3086,94 +3647,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s0 +; SDAG-NEXT: v_mov_b32_e32 v29, s1 +; SDAG-NEXT: v_mov_b32_e32 v30, s2 +; SDAG-NEXT: v_mov_b32_e32 v31, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, s24 +; GISEL-NEXT: v_mov_b32_e32 v19, s25 +; GISEL-NEXT: v_mov_b32_e32 v24, v0 +; GISEL-NEXT: v_mov_b32_e32 v25, v1 +; GISEL-NEXT: v_mov_b32_e32 v26, v2 +; GISEL-NEXT: v_mov_b32_e32 v27, v3 +; GISEL-NEXT: v_mov_b32_e32 v28, v4 +; GISEL-NEXT: v_mov_b32_e32 v29, v5 +; GISEL-NEXT: v_mov_b32_e32 v30, v6 +; GISEL-NEXT: v_mov_b32_e32 v31, v7 +; GISEL-NEXT: v_mov_b32_e32 v32, v8 +; GISEL-NEXT: v_mov_b32_e32 v33, v9 +; GISEL-NEXT: v_mov_b32_e32 v16, v10 +; GISEL-NEXT: v_mov_b32_e32 v20, s26 +; GISEL-NEXT: v_mov_b32_e32 v21, s27 +; GISEL-NEXT: v_mov_b32_e32 v22, s28 +; GISEL-NEXT: v_mov_b32_e32 v23, s29 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3270,25 +3845,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: @@ -3306,22 +3898,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3333,25 +3917,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: @@ -3369,22 +3970,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3396,25 +3989,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: @@ -3432,22 +4042,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3459,94 +4061,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s0 +; SDAG-NEXT: v_mov_b32_e32 v29, s1 +; SDAG-NEXT: v_mov_b32_e32 v30, s2 +; SDAG-NEXT: v_mov_b32_e32 v31, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, s24 +; GISEL-NEXT: v_mov_b32_e32 v19, s25 +; GISEL-NEXT: v_mov_b32_e32 v24, v0 +; GISEL-NEXT: v_mov_b32_e32 v25, v1 +; GISEL-NEXT: v_mov_b32_e32 v26, v2 +; GISEL-NEXT: v_mov_b32_e32 v27, v3 +; GISEL-NEXT: v_mov_b32_e32 v28, v4 +; GISEL-NEXT: v_mov_b32_e32 v29, v5 +; GISEL-NEXT: v_mov_b32_e32 v30, v6 +; GISEL-NEXT: v_mov_b32_e32 v31, v7 +; GISEL-NEXT: v_mov_b32_e32 v32, v8 +; GISEL-NEXT: v_mov_b32_e32 v33, v9 +; GISEL-NEXT: v_mov_b32_e32 v16, v10 +; GISEL-NEXT: v_mov_b32_e32 v20, s26 +; GISEL-NEXT: v_mov_b32_e32 v21, s27 +; GISEL-NEXT: v_mov_b32_e32 v22, s28 +; GISEL-NEXT: v_mov_b32_e32 v23, s29 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 21af2dde2c4bf..3fffb04de1a63 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -96,66 +96,66 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 -; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61 ; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 -; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 -; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 -; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 -; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 -; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 -; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 -; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 -; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 -; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 -; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 -; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 -; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 -; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 -; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 -; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 -; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41 ; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 -; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 -; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 7 @@ -266,36 +266,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-NEXT: s_nop 7 @@ -359,36 +359,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GREEDY942-NEXT: s_nop 7 @@ -507,106 +507,76 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36 -; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 -; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38 -; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 -; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 -; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 -; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 -; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 -; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 -; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 -; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 -; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 -; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 -; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 -; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 -; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 -; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 +; FAST90A-NEXT: v_accvgpr_write_b32 a64, s36 +; FAST90A-NEXT: v_accvgpr_write_b32 a65, s37 +; FAST90A-NEXT: v_accvgpr_write_b32 a66, s38 +; FAST90A-NEXT: v_accvgpr_write_b32 a67, s39 +; FAST90A-NEXT: v_accvgpr_write_b32 a68, s40 +; FAST90A-NEXT: v_accvgpr_write_b32 a69, s41 +; FAST90A-NEXT: v_accvgpr_write_b32 a70, s42 +; FAST90A-NEXT: v_accvgpr_write_b32 a71, s43 +; FAST90A-NEXT: v_accvgpr_write_b32 a72, s44 +; FAST90A-NEXT: v_accvgpr_write_b32 a73, s45 +; FAST90A-NEXT: v_accvgpr_write_b32 a74, s46 +; FAST90A-NEXT: v_accvgpr_write_b32 a75, s47 +; FAST90A-NEXT: v_accvgpr_write_b32 a76, s48 +; FAST90A-NEXT: v_accvgpr_write_b32 a77, s49 +; FAST90A-NEXT: v_accvgpr_write_b32 a78, s50 +; FAST90A-NEXT: v_accvgpr_write_b32 a79, s51 +; FAST90A-NEXT: v_accvgpr_write_b32 a80, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a81, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a82, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a83, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a84, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a85, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a86, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a87, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a88, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a89, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a90, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a91, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a92, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a93, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a94, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a95, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[64:95] ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 -; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 -; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 -; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 -; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 -; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 -; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 -; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 -; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 -; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 -; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 -; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 -; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 -; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 -; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 -; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 -; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13 -; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 -; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 -; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 -; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 -; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 -; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 -; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 -; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 -; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 -; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 -; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 -; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 -; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 -; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 -; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 -; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 -; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 -; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 -; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 -; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 -; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 -; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 -; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 -; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 -; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 -; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 -; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 -; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 -; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 -; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 -; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a65 +; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a64 +; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61 +; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60 +; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59 +; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58 +; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57 +; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56 +; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55 +; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54 +; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53 +; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52 +; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51 +; FAST90A-NEXT: v_accvgpr_mov_b32 a20, a50 +; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49 +; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48 +; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47 +; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46 +; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45 +; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44 +; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43 +; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42 +; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41 +; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40 +; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39 +; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38 +; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37 +; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36 +; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35 +; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34 +; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 7 @@ -636,42 +606,42 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908: ; %bb.0: ; %bb ; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 +; GREEDY908-NEXT: v_mov_b32_e32 v12, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 +; GREEDY908-NEXT: v_mov_b32_e32 v13, s15 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 -; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v13 +; GREEDY908-NEXT: v_mov_b32_e32 v13, s12 ; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v13 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 +; GREEDY908-NEXT: v_mov_b32_e32 v13, s9 ; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v13 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 +; GREEDY908-NEXT: v_mov_b32_e32 v13, s6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v13 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 +; GREEDY908-NEXT: v_mov_b32_e32 v13, s3 ; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v13 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 +; GREEDY908-NEXT: v_mov_b32_e32 v13, s0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v13 ; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] @@ -679,10 +649,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a18 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v13 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 7 @@ -691,113 +661,167 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a8 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a4 +; GREEDY908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; GREEDY908-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; GREEDY908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GREEDY908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GREEDY908-NEXT: s_endpgm ; ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: v_mov_b32_e32 v16, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v17, 2.0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v16, v17, a[16:31] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[16:31] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 -; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_accvgpr_read_b32 v0, a16 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v1, a17 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v15, a13 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v14, a12 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v13, a11 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v12, a10 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v11, a9 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v10, a8 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v9, a7 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v8, a6 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v7, a5 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v6, a4 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v5, a3 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v4, a2 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GREEDY90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-NEXT: s_nop 0 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[0:15] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: v_mov_b32_e32 v16, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v17, 2.0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15 +; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v16, v17, a[16:31] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[16:31] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 0 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 -; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_accvgpr_read_b32 v0, a16 +; GREEDY942-NEXT: v_accvgpr_read_b32 v1, a17 +; GREEDY942-NEXT: v_accvgpr_read_b32 v15, a13 +; GREEDY942-NEXT: v_accvgpr_read_b32 v14, a12 +; GREEDY942-NEXT: v_accvgpr_read_b32 v13, a11 +; GREEDY942-NEXT: v_accvgpr_read_b32 v12, a10 +; GREEDY942-NEXT: v_accvgpr_read_b32 v11, a9 +; GREEDY942-NEXT: v_accvgpr_read_b32 v10, a8 +; GREEDY942-NEXT: v_accvgpr_read_b32 v9, a7 +; GREEDY942-NEXT: v_accvgpr_read_b32 v8, a6 +; GREEDY942-NEXT: v_accvgpr_read_b32 v7, a5 +; GREEDY942-NEXT: v_accvgpr_read_b32 v6, a4 +; GREEDY942-NEXT: v_accvgpr_read_b32 v5, a3 +; GREEDY942-NEXT: v_accvgpr_read_b32 v4, a2 +; GREEDY942-NEXT: v_accvgpr_read_b32 v3, a1 +; GREEDY942-NEXT: v_accvgpr_read_b32 v2, a0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a0, v0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a1, v1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a3, v3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a4, v4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a5, v5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a6, v6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a7, v7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a8, v8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a9, v9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a10, v10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a11, v11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a12, v12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a13, v13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a14, v14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a15, v15 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY942-NEXT: s_nop 0 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[0:15] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32: @@ -857,51 +881,53 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-LABEL: test_mfma_f32_16x16x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 -; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 -; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 +; FAST90A-NEXT: v_accvgpr_write_b32 a47, s19 +; FAST90A-NEXT: v_accvgpr_write_b32 a46, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a45, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a44, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a43, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a42, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a41, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a40, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a39, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a38, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a37, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a36, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a35, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a34, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a33, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a32, s4 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[32:47], v0, v1, a[32:47] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[32:47] ; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 -; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 -; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 -; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19 -; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20 -; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21 -; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22 -; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23 -; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24 -; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25 -; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26 -; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27 -; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 +; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 +; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 +; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27 +; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26 +; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25 +; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24 +; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23 +; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22 +; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21 +; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20 +; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19 +; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 +; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 ; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 From 83abf51aed3ad0aab7ca1fd97a51880b61f9de90 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 7 Apr 2025 17:23:19 -0700 Subject: [PATCH 2/3] Inflate src reg for subreg inserts Change-Id: I8562c6fae3b4aefd6ddbf3f6dbad18ffa0cf6331 --- llvm/lib/CodeGen/RegisterCoalescer.cpp | 7 +- .../coalesce-copy-to-agpr-to-av-registers.mir | 156 +- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 60 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 680 +- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 3746 ++++++---- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 6378 +++++++++++------ .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 3270 +++------ .../AMDGPU/mfma-no-register-aliasing.ll | 170 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 42 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 14 +- 10 files changed, 8247 insertions(+), 6276 deletions(-) diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 15cb68012d0c9..74416cae378ab 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -518,9 +518,10 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { SrcIdx = DstSub; NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); if (!NewRC) { - auto SuperDstRC = MRI.getLargestConstrainedSuperClass(Dst); - if (SuperDstRC != DstRC) - NewRC = TRI.getMatchingSuperRegClass(SuperDstRC, SrcRC, DstSub); + auto SuperSrcRC = MRI.getLargestConstrainedSuperClass(Src); + if (SuperSrcRC != SrcRC) { + NewRC = TRI.getMatchingSuperRegClass(DstRC, SuperSrcRC, DstSub); + } } } else if (SrcSub) { // DstReg will be merged with a sub-register of SrcReg. diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 3f1965107b361..5043c7b8093b1 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -16,11 +16,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64 ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64 = COPY $vgpr1 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -41,11 +39,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2 ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64_align2 = COPY $vgpr1 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -66,13 +62,10 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96 = COPY $vgpr2 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -95,13 +88,10 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96_align2 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96_align2 = COPY $vgpr2 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -124,11 +114,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128 ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_128 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:areg_128 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 @@ -149,11 +137,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2 ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 @@ -199,11 +185,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96 ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:areg_96 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 @@ -224,11 +208,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2 ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 @@ -249,11 +231,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96 ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_96 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96 = COPY $vgpr2 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 @@ -274,11 +254,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2 ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96_align2 = COPY $vgpr2 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 @@ -299,10 +277,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 @@ -322,11 +299,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2 ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64_align2 = COPY $vgpr1 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -347,10 +322,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 @@ -370,10 +344,9 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 @@ -393,12 +366,11 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_128 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 @@ -420,12 +392,11 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_128_align2 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 @@ -448,10 +419,9 @@ body: | ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1:areg_64 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 @@ -499,12 +469,12 @@ body: | ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_and_vreg64_coalesce_with_av64_other_v_use ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3538953 /* reguse:VReg_64 */, [[COPY]] + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3538953 /* reguse:VReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 452033f332659..ef27cb554d000 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -160,22 +160,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 @@ -205,22 +205,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 4628a9c15391b..cc738e74df750 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -10,41 +10,73 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, ; -------------------------------------------------------------------- define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_16x16x32_f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) ret <4 x float> %result } define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_16x16x32_f16__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) ret <4 x float> %result } @@ -408,91 +440,173 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < } define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) ret <16 x float> %result } define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1) ret <16 x float> %result } @@ -910,41 +1024,73 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_16x16x64_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_16x16x64_i8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_16x16x64_i8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) ret <4 x i32> %result } define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_16x16x64_i8__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_16x16x64_i8__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1) ret <4 x i32> %result } @@ -1330,91 +1476,173 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 } define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) ret <16 x i32> %result } define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 1, i32 1) ret <16 x i32> %result } @@ -1859,10 +2087,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-LABEL: test_mfma_f32_16x16x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] ; GCN-NEXT: s_nop 7 @@ -1879,10 +2107,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 7cc726a3bd79c..1a367fd0afb92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -14,22 +14,39 @@ ; fp8 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -38,22 +55,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -62,22 +96,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -86,22 +137,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -110,22 +178,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -134,22 +219,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -158,46 +260,80 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 0, ; blgp - i32 2, i32 %scale0, i32 3, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -207,22 +343,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -232,22 +385,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; fp8 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -257,22 +427,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -282,22 +469,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; fp8 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -307,22 +511,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -332,22 +553,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; fp8 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp @@ -357,22 +595,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp @@ -382,22 +637,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; fp8 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -407,22 +679,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -432,22 +721,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; bf8 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp @@ -457,22 +763,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp @@ -482,22 +805,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; bf8 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp @@ -508,47 +848,81 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 1, ; cbsz - i32 1, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <4 x float> %result -} - -; bf8 x fp6 -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf8 x fp6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp @@ -557,22 +931,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp @@ -582,22 +973,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; bf8 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp @@ -607,22 +1015,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp @@ -632,22 +1057,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; bf8 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp @@ -657,22 +1099,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp @@ -682,22 +1141,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; fp6 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -707,22 +1183,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -732,22 +1225,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; fp6 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -757,22 +1267,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -782,21 +1309,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; fp6 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -806,21 +1349,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -830,21 +1389,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons ; fp6 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -854,21 +1429,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -879,22 +1470,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; bf6 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp @@ -904,47 +1512,81 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 3, ; cbsz - i32 0, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <4 x float> %result -} - -; bf6 x bf8 -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf6 x bf8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -954,22 +1596,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -979,21 +1638,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; bf6 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -1003,21 +1678,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -1027,21 +1718,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; bf6 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -1051,21 +1758,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -1075,21 +1798,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; bf6 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -1099,45 +1838,77 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, - i32 3, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <4 x float> %result -} - -; fp6 x fp4 -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp6 x fp4 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -1147,21 +1918,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -1171,22 +1958,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; fp4 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -1196,22 +2000,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -1221,22 +2042,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; fp4 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -1246,22 +2084,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -1271,21 +2126,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; fp4 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -1295,21 +2166,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -1319,21 +2206,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; fp4 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -1343,21 +2246,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -1367,21 +2286,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; fp4 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -1391,21 +2326,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -1418,65 +2369,117 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; -------------------------------------------------------------------- define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s1 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1493,8 +2496,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v17, s17 ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v20, s28 -; SDAG-NEXT: v_mov_b32_e32 v21, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 ; SDAG-NEXT: v_mov_b32_e32 v4, s20 ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 @@ -1503,10 +2506,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v9, s25 ; SDAG-NEXT: v_mov_b32_e32 v10, s26 ; SDAG-NEXT: v_mov_b32_e32 v11, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s28 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s29 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 @@ -1528,16 +2529,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v20, s28 -; GISEL-NEXT: v_mov_b32_e32 v21, s29 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s28 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s29 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 @@ -1563,10 +2562,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 @@ -1617,10 +2616,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 @@ -1663,6 +2662,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v14, s0 ; SDAG-NEXT: v_mov_b32_e32 v15, s1 ; SDAG-NEXT: v_mov_b32_e32 v16, s2 @@ -1671,10 +2674,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 @@ -1693,13 +2692,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 @@ -1789,22 +2788,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result } @@ -1813,12 +2829,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_movk_i32 s0, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -1853,11 +2869,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_movk_i32 s0, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_movk_i32 s0, 0x41 ; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] @@ -2026,86 +3042,154 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } ; This should be optimized to avoid the scale, with non-0 op_sel arguments. define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result } @@ -2115,22 +3199,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; -------------------------------------------------------------------- define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -2139,22 +3240,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -2163,21 +3281,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -2186,21 +3320,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -2209,22 +3359,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -2233,22 +3400,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -2257,22 +3441,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -2281,22 +3482,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -2305,21 +3523,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -2328,21 +3562,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 1ab7de7cda935..a4b646b49bef9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -990,331 +990,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; fp8 x fp6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp8 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 3, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp8 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; bf8 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -1336,30 +1034,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -1380,89 +1077,129 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 0, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 0, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; fp8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -1484,30 +1221,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -1528,190 +1264,408 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 1, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 1, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) +; fp8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 2, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; bf8 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -1732,235 +1686,135 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 3, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; bf8 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, +; bf8 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 4, ; blgp + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} -; fp6 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -1981,58 +1835,846 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 0, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; bf8 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -2041,47 +2683,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -2091,46 +2775,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; fp6 x fp6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -2139,46 +2864,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -2188,46 +2954,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; fp6 x bf6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -2236,46 +3043,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -2286,98 +3134,184 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; bf6 x fp8 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp @@ -2387,49 +3321,93 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; bf6 x bf8 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -2438,47 +3416,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -2488,46 +3508,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; bf6 x fp6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -2536,46 +3597,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -2585,46 +3687,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; bf6 x fp4 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -2633,46 +3776,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -2682,46 +3866,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; bf6 x bf6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -2730,46 +3955,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -2779,46 +4045,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; fp6 x fp4 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -2827,96 +4134,179 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp4 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -2925,47 +4315,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -2975,47 +4407,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; fp4 x bf8 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -3024,47 +4498,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -3074,46 +4590,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; fp4 x fp6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -3122,46 +4679,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -3171,46 +4769,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; fp4 x bf6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -3219,46 +4858,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -3268,46 +4948,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; fp4 x fp4 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -3316,46 +5037,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -3515,31 +5277,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: v_mov_b32_e32 v34, s2 -; SDAG-NEXT: v_mov_b32_e32 v35, s3 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v31, v13 -; SDAG-NEXT: v_mov_b32_e32 v30, v12 -; SDAG-NEXT: v_mov_b32_e32 v29, v11 -; SDAG-NEXT: v_mov_b32_e32 v28, v10 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v18, s2 +; SDAG-NEXT: v_mov_b32_e32 v19, s3 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s28 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s29 ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 @@ -3548,23 +5309,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v5, s25 ; SDAG-NEXT: v_mov_b32_e32 v6, s26 ; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3593,48 +5339,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v28, v10 -; GISEL-NEXT: v_mov_b32_e32 v29, v11 -; GISEL-NEXT: v_mov_b32_e32 v30, v12 -; GISEL-NEXT: v_mov_b32_e32 v31, v13 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s28 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3662,33 +5392,33 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3717,10 +5447,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 @@ -3729,6 +5455,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 ; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 ; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] ; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 @@ -3738,7 +5468,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3767,32 +5497,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3821,10 +5551,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 @@ -3833,6 +5559,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 ; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 ; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] ; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 @@ -3842,7 +5572,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3871,32 +5601,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3925,11 +5655,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3937,6 +5663,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 ; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 ; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 @@ -3945,8 +5672,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -4021,48 +5751,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: v_mov_b32_e32 v34, s2 -; SDAG-NEXT: v_mov_b32_e32 v35, s3 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v31, v13 -; SDAG-NEXT: v_mov_b32_e32 v30, v12 -; SDAG-NEXT: v_mov_b32_e32 v29, v11 -; SDAG-NEXT: v_mov_b32_e32 v28, v10 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 -; SDAG-NEXT: v_mov_b32_e32 v24, s28 -; SDAG-NEXT: v_mov_b32_e32 v25, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v18, s2 +; SDAG-NEXT: v_mov_b32_e32 v19, s3 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s24 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s25 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s26 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s28 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s29 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4091,44 +5805,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v16, s20 -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v28, v10 -; GISEL-NEXT: v_mov_b32_e32 v29, v11 -; GISEL-NEXT: v_mov_b32_e32 v30, v12 -; GISEL-NEXT: v_mov_b32_e32 v31, v13 -; GISEL-NEXT: v_mov_b32_e32 v17, s21 -; GISEL-NEXT: v_mov_b32_e32 v18, s22 -; GISEL-NEXT: v_mov_b32_e32 v19, s23 -; GISEL-NEXT: v_mov_b32_e32 v20, s24 -; GISEL-NEXT: v_mov_b32_e32 v21, s25 -; GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GISEL-NEXT: v_mov_b32_e32 v23, s27 -; GISEL-NEXT: v_mov_b32_e32 v24, s28 -; GISEL-NEXT: v_mov_b32_e32 v25, s29 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s24 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s25 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s26 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s27 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s28 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -5901,49 +7599,93 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -5952,49 +7694,93 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 08167cd226a4c..ea4560a6c1c85 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -72,94 +72,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_f16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_f16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -297,217 +249,85 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -520,7 +340,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: v_mov_b32_e32 v29, s1 ; SDAG-NEXT: v_mov_b32_e32 v30, s2 ; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: v_mov_b32_e32 v26, v8 ; SDAG-NEXT: v_mov_b32_e32 v25, v7 @@ -531,12 +350,12 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: v_mov_b32_e32 v20, v2 ; SDAG-NEXT: v_mov_b32_e32 v19, v1 ; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 @@ -545,79 +364,73 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 ; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -668,17 +481,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -688,17 +496,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result @@ -708,17 +511,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result @@ -809,42 +607,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -854,42 +635,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result @@ -899,42 +663,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result @@ -948,7 +695,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: v_mov_b32_e32 v29, s1 ; GCN-NEXT: v_mov_b32_e32 v30, s2 ; GCN-NEXT: v_mov_b32_e32 v31, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NEXT: v_mov_b32_e32 v27, v9 ; GCN-NEXT: v_mov_b32_e32 v26, v8 ; GCN-NEXT: v_mov_b32_e32 v25, v7 @@ -959,12 +705,12 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: v_mov_b32_e32 v20, v2 ; GCN-NEXT: v_mov_b32_e32 v19, v1 ; GCN-NEXT: v_mov_b32_e32 v18, v0 +; GCN-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NEXT: v_mov_b32_e32 v13, s25 ; GCN-NEXT: v_mov_b32_e32 v14, s26 ; GCN-NEXT: v_mov_b32_e32 v15, s27 ; GCN-NEXT: v_mov_b32_e32 v16, s28 ; GCN-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 ; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: v_mov_b32_e32 v2, s18 @@ -973,41 +719,26 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: v_mov_b32_e32 v5, s21 ; GCN-NEXT: v_mov_b32_e32 v6, s22 ; GCN-NEXT: v_mov_b32_e32 v7, s23 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[28:31], v[0:7], v10 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -1087,94 +818,46 @@ bb: } define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_i32_16x16x128_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result } define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x i32> %result } define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x i32> %result } @@ -1318,217 +1001,85 @@ bb: } define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) +; GCN-LABEL: test_smfmac_i32_32x32x64_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result } define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x i32> %result } define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x i32> %result } @@ -1541,7 +1092,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v29, s1 ; SDAG-NEXT: v_mov_b32_e32 v30, s2 ; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: v_mov_b32_e32 v26, v8 ; SDAG-NEXT: v_mov_b32_e32 v25, v7 @@ -1552,12 +1102,12 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v20, v2 ; SDAG-NEXT: v_mov_b32_e32 v19, v1 ; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 @@ -1566,79 +1116,73 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 ; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result @@ -1718,94 +1262,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1935,94 +1431,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2152,94 +1600,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2369,94 +1769,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2600,217 +1952,85 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -2823,7 +2043,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v29, s1 ; SDAG-NEXT: v_mov_b32_e32 v30, s2 ; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: v_mov_b32_e32 v26, v8 ; SDAG-NEXT: v_mov_b32_e32 v25, v7 @@ -2834,12 +2053,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v20, v2 ; SDAG-NEXT: v_mov_b32_e32 v19, v1 ; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 @@ -2848,79 +2067,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 ; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3014,217 +2227,85 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -3237,7 +2318,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v29, s1 ; SDAG-NEXT: v_mov_b32_e32 v30, s2 ; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: v_mov_b32_e32 v26, v8 ; SDAG-NEXT: v_mov_b32_e32 v25, v7 @@ -3248,12 +2328,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v20, v2 ; SDAG-NEXT: v_mov_b32_e32 v19, v1 ; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 @@ -3262,79 +2342,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 ; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3428,217 +2502,85 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -3651,7 +2593,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v29, s1 ; SDAG-NEXT: v_mov_b32_e32 v30, s2 ; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: v_mov_b32_e32 v26, v8 ; SDAG-NEXT: v_mov_b32_e32 v25, v7 @@ -3662,12 +2603,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v20, v2 ; SDAG-NEXT: v_mov_b32_e32 v19, v1 ; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 @@ -3676,79 +2617,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 ; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3842,217 +2777,85 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -4065,7 +2868,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v29, s1 ; SDAG-NEXT: v_mov_b32_e32 v30, s2 ; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: v_mov_b32_e32 v26, v8 ; SDAG-NEXT: v_mov_b32_e32 v25, v7 @@ -4076,12 +2878,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v20, v2 ; SDAG-NEXT: v_mov_b32_e32 v19, v1 ; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 @@ -4090,79 +2892,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 ; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 3fffb04de1a63..7dd5366a35c8d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -683,69 +683,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v16, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v17, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v16, v17, a[16:31] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[16:31] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v0, a16 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v1, a17 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v15, a13 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v14, a12 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v13, a11 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v12, a10 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v11, a9 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v10, a8 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v9, a7 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v8, a6 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v7, a5 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v6, a4 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v5, a3 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v4, a2 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GREEDY90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, v3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, v4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, v5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, v6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, v7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, v8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, v9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, v10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, v11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, v12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, v13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, v14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-NEXT: s_nop 0 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[0:15] ; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 2 +; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -755,69 +725,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v16, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v17, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15 -; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 +; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v16, v17, a[16:31] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[16:31] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 0 -; GREEDY942-NEXT: v_accvgpr_read_b32 v0, a16 -; GREEDY942-NEXT: v_accvgpr_read_b32 v1, a17 -; GREEDY942-NEXT: v_accvgpr_read_b32 v15, a13 -; GREEDY942-NEXT: v_accvgpr_read_b32 v14, a12 -; GREEDY942-NEXT: v_accvgpr_read_b32 v13, a11 -; GREEDY942-NEXT: v_accvgpr_read_b32 v12, a10 -; GREEDY942-NEXT: v_accvgpr_read_b32 v11, a9 -; GREEDY942-NEXT: v_accvgpr_read_b32 v10, a8 -; GREEDY942-NEXT: v_accvgpr_read_b32 v9, a7 -; GREEDY942-NEXT: v_accvgpr_read_b32 v8, a6 -; GREEDY942-NEXT: v_accvgpr_read_b32 v7, a5 -; GREEDY942-NEXT: v_accvgpr_read_b32 v6, a4 -; GREEDY942-NEXT: v_accvgpr_read_b32 v5, a3 -; GREEDY942-NEXT: v_accvgpr_read_b32 v4, a2 -; GREEDY942-NEXT: v_accvgpr_read_b32 v3, a1 -; GREEDY942-NEXT: v_accvgpr_read_b32 v2, a0 -; GREEDY942-NEXT: v_accvgpr_write_b32 a0, v0 -; GREEDY942-NEXT: v_accvgpr_write_b32 a1, v1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a2, v2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a3, v3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a4, v4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a5, v5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a6, v6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a7, v7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a8, v8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a9, v9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a10, v10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a11, v11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a12, v12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a13, v13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a14, v14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a15, v15 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GREEDY942-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY942-NEXT: s_nop 0 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[0:15] ; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: s_nop 0 ; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1058,11 +998,13 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 ; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] -; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3] -; FAST90A-NEXT: s_nop 4 -; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a4 +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[8:11], v1, v2, a[0:3] +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[8:11] +; FAST90A-NEXT: s_nop 3 +; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a9 +; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a8 ; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a5 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a4 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] ; FAST90A-NEXT: s_nop 4 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 3844d6054e130..6eca04277585e 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -15,27 +15,27 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_mov_b32 s6, 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s3, 1 -; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: s_and_b32 s3, s5, s4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: s_or_b32 s4, s6, 1 +; GFX942-NEXT: s_ashr_i32 s3, s6, 31 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3] +; GFX942-NEXT: s_and_b32 s6, s3, s4 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: ; implicit-def: $sgpr6 ; GFX942-NEXT: ; implicit-def: $agpr2 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm @@ -52,31 +52,31 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: s_mov_b32 s6, 0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s3, 1 -; GFX908-NEXT: s_ashr_i32 s5, s3, 31 +; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: s_mov_b32 s3, s2 -; GFX908-NEXT: v_mov_b32_e32 v1, s2 -; GFX908-NEXT: s_nop 2 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_mov_b32_e32 v0, s2 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, s3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a1 +; GFX908-NEXT: s_or_b32 s4, s6, 1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s3, s5, s4 -; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] +; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX908-NEXT: s_ashr_i32 s3, s6, 31 +; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3] +; GFX908-NEXT: s_and_b32 s6, s3, s4 ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: ; implicit-def: $sgpr6 ; GFX908-NEXT: ; implicit-def: $agpr2 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir index ee5481617cf59..eb1122b6b62f8 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -89,7 +89,7 @@ body: | ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -102,11 +102,10 @@ body: | ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: @@ -143,7 +142,7 @@ body: | ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -156,11 +155,10 @@ body: | ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: From 4e7bf6c242ebb2b348a2c178f439f2ffdac5d252 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 4 Jun 2025 10:35:06 -0700 Subject: [PATCH 3/3] Fix comment typo Change-Id: Iac1f38523e51f8abb796ed1f86b95132d86679d4 --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 5f868efde8d9c..a20e9f98dd410 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -729,8 +729,9 @@ class MachineRegisterInfo { bool constrainRegAttrs(Register Reg, Register ConstrainingReg, unsigned MinNumRegs = 0); - /// recomputeRegClass - Try to find a legal super-class of Reg's register - /// class that still satisfies the constraints from the instructions using + /// getLargestConstrainedSuperClass - Try to find a legal super-class of Reg's + /// register class that still satisfies the constraints from the instructions + /// using /// \p Reg. \p return the super-class TargetRegisterClass if one was found, /// otherwise \p return the original TargetRegisterClass. const TargetRegisterClass *