diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 3403cbab526d4..508f2dd83108d 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -204,7 +204,7 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel, FeatureISAVersion9_4_2.Features >; -def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel, +def : ProcessorModel<"gfx950", SIDPGFX950FullSpeedModel, FeatureISAVersion9_5_0.Features >; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index a60b1f28e9d34..117add324db56 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -64,6 +64,7 @@ def Write8PassMAI : SchedWrite; def Write16PassMAI : SchedWrite; def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; +def Write16PassDGEMM : SchedWrite; // Scalar float instructions def WriteSFPU : SchedWrite; @@ -94,6 +95,7 @@ def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; def SIDPGFX940FullSpeedModel : SISchedMachineModel; +def SIDPGFX950FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; def GFX11SpeedModel : SISchedMachineModel; def GFX12SpeedModel : SISchedMachineModel; @@ -169,6 +171,8 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes; let ReleaseAtCycles = [8] in def : HWVALUWriteRes; + let ReleaseAtCycles = [16] in + def : HWVALUWriteRes; let ReleaseAtCycles = [2] in def : HWWriteRes; @@ -201,6 +205,13 @@ def WriteCopy : SchedWriteVariant<[ SchedVar, SchedVar]>; +// Check if any matrix inputs are interpreted as f8 in an f8f6f4 mfma +// instruction. +def PredIsF8_MFMA_SCALE : SchedPredicate<[{ + TII->getNamedOperand(*MI, AMDGPU::OpName::cbsz)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2 || + TII->getNamedOperand(*MI, AMDGPU::OpName::blgp)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2 +}]>; + let SchedModel = SIFullSpeedModel in { defm : SICommonWriteRes; @@ -299,6 +310,58 @@ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; } // End SchedModel = SIDPGFX940FullSpeedModel + +let SchedModel = SIDPGFX950FullSpeedModel in { +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X64")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>; + +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X32_")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>; + +def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; +def : InstRW<[Write16PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; + + +// If either matrix format is f8, the instruction takes 2x as many +// cycles. TODO: This isn't reflected in MCA. +def WriteMFMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def WriteMFMAScale_32X32X64_F8F6F4 : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def : InstRW<[WriteMFMAScale_16X16X128_F8F6F4, MIMFMARead], + (instregex "^V_MFMA(_SCALE)?_.32_16X16X128_F8F6F4")>; +def : InstRW<[WriteMFMAScale_32X32X64_F8F6F4, MIMFMARead], + (instregex "^V_MFMA(_SCALE)?_.32_32X32X64_F8F6F4")>; + +} // End SchedModel = SIDPGFX950FullSpeedModel + + let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index d0ae669ffb3d6..5d149f7c0c62e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) @@ -49,52 +49,366 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ret <4 x float> %result } -define <4 x float> @test_mfma_f32_16x16x32_f16__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v0 -; GCN-NEXT: v_accvgpr_write_b32 a1, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v3 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] -; GCN-NEXT: s_nop 6 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) - ret <4 x float> %result + store <4 x float> %result, ptr addrspace(1) %out + ret void } -define <4 x float> @test_mfma_f32_16x16x32_f16___flags__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16___flags__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v0 -; GCN-NEXT: v_accvgpr_write_b32 a1, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v3 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) - ret <4 x float> %result +define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) + store <4 x float> %result, ptr addrspace(1) %out + ret void } ; -------------------------------------------------------------------- ; llvm.amdgcn.mfma.f32.32x32x16.f16 ; -------------------------------------------------------------------- -define <16 x float> @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16: +define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 { +; SDAG-LABEL: test_mfma_f32_32x32x16_f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) + store volatile <16 x float> %result, ptr addrspace(1) null + store volatile <16 x float> %arg2, ptr addrspace(1) null + ret void +} + +define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 { +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) + store volatile <16 x float> %result, ptr addrspace(1) null + store volatile <16 x float> %arg2, ptr addrspace(1) null + ret void +} + +define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, v8 @@ -138,8 +452,8 @@ define <16 x float> @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %ar ret <16 x float> %result } -define <16 x float> @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__flags: +define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, v8 @@ -183,155 +497,327 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x ha ret <16 x float> %result } -define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<16 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v0 -; GCN-NEXT: v_accvgpr_write_b32 a1, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v3 -; GCN-NEXT: v_accvgpr_write_b32 a4, v4 -; GCN-NEXT: v_accvgpr_write_b32 a5, v5 -; GCN-NEXT: v_accvgpr_write_b32 a6, v6 -; GCN-NEXT: v_accvgpr_write_b32 a7, v7 -; GCN-NEXT: v_accvgpr_write_b32 a8, v8 -; GCN-NEXT: v_accvgpr_write_b32 a9, v9 -; GCN-NEXT: v_accvgpr_write_b32 a10, v10 -; GCN-NEXT: v_accvgpr_write_b32 a11, v11 -; GCN-NEXT: v_accvgpr_write_b32 a12, v12 -; GCN-NEXT: v_accvgpr_write_b32 a13, v13 -; GCN-NEXT: v_accvgpr_write_b32 a14, v14 -; GCN-NEXT: v_accvgpr_write_b32 a15, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[20:23], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 { +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) - ret <16 x float> %result + store volatile <16 x float> %arg2, ptr addrspace(1) %out + store volatile <16 x float> %result, ptr addrspace(1) %out + ret void } -define <16 x float> @test_mfma_f32_32x32x16_f16__flags__mac(<16 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__flags__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v0 -; GCN-NEXT: v_accvgpr_write_b32 a1, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v3 -; GCN-NEXT: v_accvgpr_write_b32 a4, v4 -; GCN-NEXT: v_accvgpr_write_b32 a5, v5 -; GCN-NEXT: v_accvgpr_write_b32 a6, v6 -; GCN-NEXT: v_accvgpr_write_b32 a7, v7 -; GCN-NEXT: v_accvgpr_write_b32 a8, v8 -; GCN-NEXT: v_accvgpr_write_b32 a9, v9 -; GCN-NEXT: v_accvgpr_write_b32 a10, v10 -; GCN-NEXT: v_accvgpr_write_b32 a11, v11 -; GCN-NEXT: v_accvgpr_write_b32 a12, v12 -; GCN-NEXT: v_accvgpr_write_b32 a13, v13 -; GCN-NEXT: v_accvgpr_write_b32 a14, v14 -; GCN-NEXT: v_accvgpr_write_b32 a15, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[20:23], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1) - ret <16 x float> %result +define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 { +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) + store volatile <16 x float> %arg2, ptr addrspace(1) %out + store volatile <16 x float> %result, ptr addrspace(1) %out + ret void } define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 { ; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[18:19] -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out @@ -341,62 +827,76 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 { ; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[18:19] -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out @@ -419,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -439,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -452,50 +952,48 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspace(1) %out, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) #0 { ; SDAG-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: v_mov_b32_e32 v1, s5 -; SDAG-NEXT: v_mov_b32_e32 v2, s6 -; SDAG-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-NEXT: v_mov_b32_e32 v4, s8 -; SDAG-NEXT: v_mov_b32_e32 v5, s9 -; SDAG-NEXT: v_mov_b32_e32 v6, s10 -; SDAG-NEXT: v_mov_b32_e32 v7, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s12 -; SDAG-NEXT: v_mov_b32_e32 v9, s13 -; SDAG-NEXT: v_mov_b32_e32 v10, s14 -; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) store <4 x i32> %result, ptr addrspace(1) %out @@ -505,50 +1003,48 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) #0 { ; SDAG-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: v_mov_b32_e32 v1, s5 -; SDAG-NEXT: v_mov_b32_e32 v2, s6 -; SDAG-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-NEXT: v_mov_b32_e32 v4, s8 -; SDAG-NEXT: v_mov_b32_e32 v5, s9 -; SDAG-NEXT: v_mov_b32_e32 v6, s10 -; SDAG-NEXT: v_mov_b32_e32 v7, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s12 -; SDAG-NEXT: v_mov_b32_e32 v9, s13 -; SDAG-NEXT: v_mov_b32_e32 v10, s14 -; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1) store <4 x i32> %result, ptr addrspace(1) %out @@ -564,137 +1060,131 @@ declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 { ; SDAG-LABEL: test_mfma_i32_32x32x32_i8: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s19 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s7 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s6 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s5 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s4 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v6, s30 +; SDAG-NEXT: v_mov_b32_e32 v7, s31 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 ; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: v_mov_b32_e32 v1, s5 -; SDAG-NEXT: v_mov_b32_e32 v2, s6 -; SDAG-NEXT: v_mov_b32_e32 v3, s7 ; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s19 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a30, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a29, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a28, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a27, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a26, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a25, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a24, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a23, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a22, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a21, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a20, s8 -; GISEL-NEXT: v_accvgpr_write_b32 a19, s7 -; GISEL-NEXT: v_accvgpr_write_b32 a18, s6 -; GISEL-NEXT: v_accvgpr_write_b32 a17, s5 -; GISEL-NEXT: v_accvgpr_write_b32 a16, s4 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 16 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 32 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v[8:9], a[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) @@ -706,137 +1196,131 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 { ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__flags: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s19 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s7 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s6 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s5 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s4 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:2 abid:3 blgp:1 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v6, s30 +; SDAG-NEXT: v_mov_b32_e32 v7, s31 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 ; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: v_mov_b32_e32 v1, s5 -; SDAG-NEXT: v_mov_b32_e32 v2, s6 -; SDAG-NEXT: v_mov_b32_e32 v3, s7 ; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__flags: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s19 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a30, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a29, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a28, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a27, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a26, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a25, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a24, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a23, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a22, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a21, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a20, s8 -; GISEL-NEXT: v_accvgpr_write_b32 a19, s7 -; GISEL-NEXT: v_accvgpr_write_b32 a18, s6 -; GISEL-NEXT: v_accvgpr_write_b32 a17, s5 -; GISEL-NEXT: v_accvgpr_write_b32 a16, s4 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 16 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 32 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v[8:9], a[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 2, i32 3, i32 1) @@ -867,7 +1351,8 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -911,7 +1396,8 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -936,116 +1422,126 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[16:23], s[0:1], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s16 -; SDAG-NEXT: v_mov_b32_e32 v33, s17 -; SDAG-NEXT: v_mov_b32_e32 v34, s18 -; SDAG-NEXT: v_mov_b32_e32 v35, s19 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v36, s20 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v37, s21 -; SDAG-NEXT: v_mov_b32_e32 v38, s22 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[18:19] -; SDAG-NEXT: v_mov_b32_e32 v39, s23 -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[4:5] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s4 -; SDAG-NEXT: v_mov_b32_e32 v17, s5 -; SDAG-NEXT: v_mov_b32_e32 v18, s6 -; SDAG-NEXT: v_mov_b32_e32 v19, s7 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xa4 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[4:5] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_mov_b32_e32 v20, 0 -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[2:3] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[2:3] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[2:3] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[2:3] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) @@ -1057,116 +1553,126 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[16:23], s[0:1], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s16 -; SDAG-NEXT: v_mov_b32_e32 v33, s17 -; SDAG-NEXT: v_mov_b32_e32 v34, s18 -; SDAG-NEXT: v_mov_b32_e32 v35, s19 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v36, s20 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v37, s21 -; SDAG-NEXT: v_mov_b32_e32 v38, s22 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[18:19] -; SDAG-NEXT: v_mov_b32_e32 v39, s23 -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[4:5] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s4 -; SDAG-NEXT: v_mov_b32_e32 v17, s5 -; SDAG-NEXT: v_mov_b32_e32 v18, s6 -; SDAG-NEXT: v_mov_b32_e32 v19, s7 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xa4 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[4:5] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_mov_b32_e32 v20, 0 -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v20, v[16:19], s[2:3] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[2:3] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[2:3] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[2:3] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[2:3] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 2, i32 3) @@ -1178,67 +1684,81 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[16:23], s[0:1], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v20, s20 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v21, s21 -; SDAG-NEXT: v_mov_b32_e32 v22, s22 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v23, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[18:19] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xa4 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) store <16 x i32> %result, ptr addrspace(1) %out @@ -1248,67 +1768,81 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[16:23], s[0:1], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v20, s20 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v21, s21 -; SDAG-NEXT: v_mov_b32_e32 v22, s22 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v23, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[18:19] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xa4 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 3, i32 2, i32 1) store <16 x i32> %result, ptr addrspace(1) %out @@ -1428,41 +1962,44 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 { ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] ; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_nop 4 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) store <4 x float> %result, ptr addrspace(1) %out @@ -1472,41 +2009,44 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 { ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_nop 4 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) store <4 x float> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 1e0a0bf2ca9d9..9a8282231ac15 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -23,7 +23,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -46,7 +47,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -69,7 +71,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -92,7 +95,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -115,7 +119,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -138,7 +143,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -161,7 +167,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -184,7 +191,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -208,7 +216,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -232,7 +241,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -256,7 +266,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -280,7 +291,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -304,7 +316,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -328,7 +341,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -352,7 +366,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -376,7 +391,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -400,7 +416,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -424,7 +441,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -448,7 +466,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -472,7 +491,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -497,7 +517,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -521,7 +542,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -544,7 +566,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -568,7 +591,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -592,7 +616,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -616,7 +641,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -640,7 +666,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -664,7 +691,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -688,7 +716,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -712,7 +741,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -736,7 +766,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -760,7 +791,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -784,7 +815,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -808,7 +839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -832,7 +863,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -857,7 +888,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -881,7 +913,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -905,7 +938,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -929,7 +963,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -953,7 +988,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -977,7 +1012,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1001,7 +1036,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1025,7 +1060,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1049,7 +1084,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1073,7 +1108,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1097,7 +1132,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1121,7 +1156,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1145,7 +1180,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1169,7 +1205,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1193,7 +1230,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1217,7 +1255,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1241,7 +1280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1265,7 +1304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1289,7 +1328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1313,7 +1352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1337,7 +1376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1361,7 +1400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1389,7 +1428,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_mov_b32_e32 v16, s1 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1409,7 +1449,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1429,7 +1470,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1443,6 +1485,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v12, s0 +; SDAG-NEXT: v_mov_b32_e32 v13, s1 +; SDAG-NEXT: v_mov_b32_e32 v14, s2 +; SDAG-NEXT: v_mov_b32_e32 v15, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 ; SDAG-NEXT: v_mov_b32_e32 v20, s28 ; SDAG-NEXT: v_mov_b32_e32 v23, v1 ; SDAG-NEXT: v_mov_b32_e32 v22, v0 @@ -1456,20 +1506,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v9, s25 ; SDAG-NEXT: v_mov_b32_e32 v10, s26 ; SDAG-NEXT: v_mov_b32_e32 v11, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s0 -; SDAG-NEXT: v_mov_b32_e32 v13, s1 -; SDAG-NEXT: v_mov_b32_e32 v14, s2 -; SDAG-NEXT: v_mov_b32_e32 v15, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1481,18 +1524,18 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: v_mov_b32_e32 v20, s28 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b32_e32 v20, s28 ; GISEL-NEXT: v_mov_b32_e32 v22, v0 ; GISEL-NEXT: v_mov_b32_e32 v23, v1 ; GISEL-NEXT: v_mov_b32_e32 v21, s29 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] @@ -1501,7 +1544,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_write_b32 a3, v23 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1515,7 +1559,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v14, s0 ; SDAG-NEXT: v_mov_b32_e32 v15, s1 ; SDAG-NEXT: v_mov_b32_e32 v16, s2 @@ -1524,12 +1567,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1544,16 +1589,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1567,7 +1613,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v14, s0 ; SDAG-NEXT: v_mov_b32_e32 v15, s1 ; SDAG-NEXT: v_mov_b32_e32 v16, s2 @@ -1576,12 +1621,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1596,16 +1643,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1633,7 +1681,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1657,7 +1706,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1677,7 +1727,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1705,7 +1756,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1720,16 +1772,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1749,7 +1802,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1770,7 +1824,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1787,7 +1842,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1809,7 +1865,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1827,7 +1884,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1868,7 +1926,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_mov_b32_e32 v17, s13 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; @@ -1878,11 +1937,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x40 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] @@ -1893,7 +1952,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) @@ -1904,13 +1964,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 @@ -1919,6 +1978,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -1932,7 +1992,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; @@ -1940,15 +2001,15 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] @@ -1958,8 +2019,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1977,7 +2039,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1998,7 +2061,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2018,7 +2082,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2038,7 +2103,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2062,7 +2128,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2085,7 +2152,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2108,7 +2176,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2131,7 +2199,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2154,7 +2222,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2177,7 +2246,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2200,7 +2270,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2223,7 +2294,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2246,7 +2318,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2269,7 +2341,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 426764d91b8a1..05f8739e7cb89 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -38,7 +38,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -81,7 +83,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -131,7 +135,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -174,7 +180,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -224,7 +232,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -267,7 +277,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -317,7 +329,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -360,7 +374,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -410,7 +426,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -453,7 +471,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -503,7 +523,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -546,7 +568,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -596,7 +620,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -639,7 +665,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -689,7 +717,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -732,7 +762,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -781,7 +813,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -832,7 +866,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -875,7 +911,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -923,7 +961,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -973,7 +1013,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1020,7 +1062,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1070,7 +1114,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1117,7 +1163,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1165,7 +1213,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1212,7 +1262,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1263,7 +1315,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1306,7 +1360,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1354,7 +1410,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1405,7 +1463,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1448,7 +1508,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1497,7 +1559,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1547,7 +1611,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1594,7 +1660,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1644,7 +1712,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1691,7 +1761,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1739,7 +1811,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1786,7 +1860,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1836,7 +1912,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1883,7 +1961,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1933,7 +2013,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1980,7 +2062,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2028,7 +2112,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2075,7 +2160,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2123,7 +2209,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2170,7 +2257,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2221,7 +2309,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2268,7 +2358,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2318,7 +2410,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2365,7 +2459,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2413,7 +2509,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2460,7 +2557,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2508,7 +2606,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2555,7 +2654,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2603,7 +2703,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2650,7 +2751,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2698,7 +2800,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2745,7 +2848,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2793,7 +2897,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2840,7 +2946,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2888,7 +2996,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2935,7 +3045,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2983,7 +3095,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3030,7 +3143,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3078,7 +3192,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3125,7 +3240,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3173,7 +3289,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3220,7 +3337,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3273,7 +3391,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3319,7 +3439,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3365,7 +3487,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3391,6 +3515,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_mov_b32_e32 v34, s2 +; SDAG-NEXT: v_mov_b32_e32 v35, s3 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v31, v13 ; SDAG-NEXT: v_mov_b32_e32 v30, v12 @@ -3416,14 +3548,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v5, s25 ; SDAG-NEXT: v_mov_b32_e32 v6, s26 ; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: v_mov_b32_e32 v34, s2 -; SDAG-NEXT: v_mov_b32_e32 v35, s3 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 @@ -3441,7 +3565,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3465,7 +3591,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b32_e32 v18, v0 @@ -3476,6 +3601,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mov_b32_e32 v23, v5 ; GISEL-NEXT: v_mov_b32_e32 v24, v6 ; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: v_mov_b32_e32 v26, v8 ; GISEL-NEXT: v_mov_b32_e32 v27, v9 ; GISEL-NEXT: v_mov_b32_e32 v28, v10 @@ -3483,12 +3613,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mov_b32_e32 v30, v12 ; GISEL-NEXT: v_mov_b32_e32 v31, v13 ; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] @@ -3509,7 +3635,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3535,7 +3663,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v26, s0 ; SDAG-NEXT: v_mov_b32_e32 v27, s1 ; SDAG-NEXT: v_mov_b32_e32 v28, s2 @@ -3544,6 +3671,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3561,7 +3689,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3588,10 +3718,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3609,7 +3739,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3635,7 +3767,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v26, s0 ; SDAG-NEXT: v_mov_b32_e32 v27, s1 ; SDAG-NEXT: v_mov_b32_e32 v28, s2 @@ -3644,6 +3775,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3661,7 +3793,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3688,10 +3822,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3709,7 +3843,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3761,7 +3897,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3809,7 +3947,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3853,7 +3993,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, s27 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3897,7 +4039,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, s27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3923,6 +4067,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_mov_b32_e32 v34, s2 +; SDAG-NEXT: v_mov_b32_e32 v35, s3 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 ; SDAG-NEXT: v_mov_b32_e32 v16, s20 ; SDAG-NEXT: v_mov_b32_e32 v31, v13 ; SDAG-NEXT: v_mov_b32_e32 v30, v12 @@ -3940,14 +4092,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v24, s28 ; SDAG-NEXT: v_mov_b32_e32 v25, s29 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: v_mov_b32_e32 v34, s2 -; SDAG-NEXT: v_mov_b32_e32 v35, s3 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 @@ -3965,7 +4109,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3987,11 +4133,15 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v16, s20 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v16, s20 ; GISEL-NEXT: v_mov_b32_e32 v26, v8 ; GISEL-NEXT: v_mov_b32_e32 v27, v9 ; GISEL-NEXT: v_mov_b32_e32 v28, v10 @@ -4007,11 +4157,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_mov_b32_e32 v23, s27 ; GISEL-NEXT: v_mov_b32_e32 v24, s28 ; GISEL-NEXT: v_mov_b32_e32 v25, s29 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 @@ -4029,7 +4175,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4074,7 +4222,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4120,7 +4270,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4162,7 +4314,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4209,7 +4363,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4252,7 +4408,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4280,7 +4438,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -4314,14 +4471,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v17 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, a[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, a[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[2:3] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: @@ -4331,11 +4491,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] @@ -4358,7 +4518,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 @@ -4372,13 +4534,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 { ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s2, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 @@ -4387,6 +4547,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -4412,26 +4573,29 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] @@ -4453,7 +4617,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -4551,13 +4717,15 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] @@ -4577,36 +4745,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: v_mov_b32_e32 v16, s1 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) @@ -4699,13 +4865,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] @@ -4724,35 +4893,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -4845,13 +5012,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] @@ -4870,35 +5040,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -4991,13 +5159,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] @@ -5016,35 +5187,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -5076,7 +5245,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5121,7 +5292,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5166,7 +5339,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5211,7 +5386,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5262,7 +5439,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5305,7 +5484,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5355,7 +5536,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5398,7 +5581,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5448,7 +5633,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5491,7 +5677,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5539,7 +5726,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5589,7 +5777,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5632,7 +5822,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5682,7 +5874,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5725,7 +5919,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5774,7 +5970,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5823,7 +6021,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5873,7 +6073,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5916,7 +6117,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5964,7 +6166,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index d91ef7b42a4a3..bf51efab2cdf1 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -15,7 +15,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec @@ -36,7 +37,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec @@ -57,7 +59,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec @@ -78,7 +81,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec @@ -98,7 +102,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 2, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 4 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 2, implicit $mode, implicit $exec @@ -118,7 +122,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 3, 3, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 4 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 3, 3, implicit $mode, implicit $exec @@ -138,7 +142,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 4, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 4 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 4, 4, implicit $mode, implicit $exec @@ -159,7 +163,9 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -180,7 +186,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -201,7 +208,9 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -222,7 +231,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -243,7 +253,8 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 2 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec @@ -264,7 +275,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: S_NOP 6 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s index cdb96dd717e94..03b4041f77bce 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s @@ -1,13 +1,9 @@ # RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s # CHECK: Iterations: 1 -# CHECK: Instructions: 15 -# CHECK: Total Cycles: 60 -# CHECK: Total uOps: 15 - -v_mfma_ld_scale_b32 v0, v0 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] +# CHECK: Instructions: 129 +# CHECK: Total Cycles: 1069 +# CHECK: Total uOps: 129 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] @@ -15,7 +11,6 @@ v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 - v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[0:3], a[4:7] v_mfma_i32_32x32x32_i8 v[0:15], v[0:3], v[0:3], v[0:15] @@ -23,20 +18,312 @@ v_mfma_i32_32x32x32_i8 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 v_mfma_f32_16x16x32_bf16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[0:3], a[4:7] +v_mfma_ld_scale_b32 v0, v0 + +;; FIXME: should have different cycle count depending on whether either matrix is f8 +;; TODO: test vdc/adc +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:2 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:3 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:4 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:3 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:4 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 blgp:1 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 blgp:2 + +;; FIXME: should have different cycle count depending on whether either matrix is f8 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:2 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:3 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:4 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:1 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:3 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:4 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 + +;; FIXME: should have different cycle count depending on whether either matrix is f8 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 + +;; FIXME +;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:1 +;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:2 +;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:3 +;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:4 + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 + +;; FIXME +;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:1 +;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:2 +;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:3 +;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:4 + + +;; TODO: These results are wrong +v_smfmac_f32_16x16x64_f16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_32x32x32_f16 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_16x16x64_bf16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_32x32x32_bf16 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_i32_16x16x128_i8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_i32_32x32x64_i8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 + +v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 + +v_smfmac_f32_32x32x64_bf8_bf8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_32x32x64_bf8_fp8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_32x32x64_fp8_bf8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +v_smfmac_f32_32x32x64_fp8_fp8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 + +v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5] +v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5] + +v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33] +v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33] + +v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3] +v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3] + +v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] +v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7] + +v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3] +v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3] + +v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15] +v_mfma_f32_32x32x8_f16 a[0:15], v[4:5], v[6:7], a[0:15] + +v_mfma_f32_16x16x16_bf16 v[0:3], v[4:5], v[6:7], v[0:3] +v_mfma_f32_16x16x16_bf16 a[0:3], v[4:5], v[6:7], a[0:3] + +v_mfma_f32_32x32x8_bf16 v[0:15], v[4:5], v[6:7], v[0:15] +v_mfma_f32_32x32x8_bf16 a[0:15], v[4:5], v[6:7], a[0:15] + +v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] +v_mfma_i32_16x16x32_i8 a[0:3], v[4:5], v[6:7], a[0:3] + +v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] +v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] + +v_mfma_f32_4x4x4_16b_f16 v[0:3], v[0:1], v[2:3], v[2:5] +v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[2:5] + +v_mfma_f32_16x16x4_4b_f16 v[0:15], v[2:3], v[4:5], v[18:33] +v_mfma_f32_16x16x4_4b_f16 a[0:15], v[2:3], v[4:5], a[18:33] + +v_mfma_f32_32x32x4_2b_f16 v[0:31], v[0:1], v[2:3], v[34:65] +v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[34:65] + +v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[0:1], v[2:3], v[2:5] +v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[0:1], v[2:3], a[2:5] + +v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[2:3], v[4:5], v[18:33] +v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[4:5], a[18:33] + +v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[0:1], v[2:3], v[34:65] +v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[34:65] + +v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] +v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[2:5] + +v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, v[18:33] +v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[18:33] + +v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5] +v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5] + +v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, v[34:65] blgp:7 +v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[34:65] blgp:7 + +v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33] +v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33] + +v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, v[2:5] +v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v1, a[2:5] + +v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, v[18:33] +v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[18:33] + +v_mfma_i32_32x32x4_2b_i8 v[0:31], v0, v1, v[34:65] +v_mfma_i32_32x32x4_2b_i8 a[0:31], v0, v1, a[34:65] + +v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1 +v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1 + +v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1 +v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3 + +v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1 +v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5 + +v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1 +v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9 + +v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1 +v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 + +v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] +v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] + +v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] +v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] + +v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] +v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] + +v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] +v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] + +v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] +v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] +v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] +v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] + +v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 + +v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 +v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 +v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 +v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 # CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0 -# CHECK-NEXT: - - - - 1.00 - - v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 -# CHECK-NEXT: - - - - 1.00 - - v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 -# CHECK-NEXT: - - - - 1.00 - - v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[0:3], a[4:7] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_i32_32x32x32_i8 v[0:15], v[0:3], v[0:3], v[0:15] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_i32_32x32x32_i8 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[0:3], a[4:7] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_i32_32x32x32_i8 v[0:15], v[0:3], v[0:3], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_i32_32x32x32_i8 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_bf16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[0:3], a[4:7] +# CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:2 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:3 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:4 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:3 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:4 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 blgp:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:3 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:4 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:1 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:3 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:4 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 op_sel_hi:[0,0,0] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 op_sel_hi:[0,0,0] +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_f16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_f16 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_bf16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_bf16 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_i32_16x16x128_i8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_i32_32x32x64_i8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x64_bf8_bf8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x64_bf8_fp8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x64_fp8_bf8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x64_fp8_fp8 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33] +# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3] +# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3] +# CHECK-NEXT: - - - - 16.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] +# CHECK-NEXT: - - - - 16.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 a[0:15], v[4:5], v[6:7], a[0:15] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_bf16 v[0:3], v[4:5], v[6:7], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_bf16 a[0:3], v[4:5], v[6:7], a[0:3] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_bf16 v[0:15], v[4:5], v[6:7], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_bf16 a[0:15], v[4:5], v[6:7], a[0:15] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_i32_16x16x32_i8 a[0:3], v[4:5], v[6:7], a[0:3] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_f32_4x4x4_16b_f16 v[0:3], v[0:1], v[2:3], v[2:5] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[2:5] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_4b_f16 v[0:15], v[2:3], v[4:5], v[18:33] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_4b_f16 a[0:15], v[2:3], v[4:5], a[18:33] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x4_2b_f16 v[0:31], v[0:1], v[2:3], v[34:65] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[34:65] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[0:1], v[2:3], v[2:5] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[0:1], v[2:3], a[2:5] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[2:3], v[4:5], v[18:33] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[4:5], a[18:33] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[0:1], v[2:3], v[34:65] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[34:65] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[2:5] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, v[18:33] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[18:33] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, v[34:65] blgp:7 +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[34:65] blgp:7 +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, v[2:5] +# CHECK-NEXT: - - - - - - 2.00 v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v1, a[2:5] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, v[18:33] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[18:33] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_i32_32x32x4_2b_i8 v[0:31], v0, v1, v[34:65] +# CHECK-NEXT: - - - - - - 16.00 v_mfma_i32_32x32x4_2b_i8 a[0:31], v0, v1, a[34:65] +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 +# CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1