diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index f76580b94e13c..21af2dde2c4bf 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -1,9 +1,10 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=FAST90A %s ; This is better with 90a @@ -14,13 +15,612 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float> declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: -; GREEDY: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] -; GREEDY: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] -; FAST: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[32:63] -; FAST: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:63] -; GCN: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { +; GREEDY908-LABEL: test_mfma_f32_32x32x1f32: +; GREEDY908: ; %bb.0: ; %bb +; GREEDY908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY908-NEXT: v_mov_b32_e32 v0, s16 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s17 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s18 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s22 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s23 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s24 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s25 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s26 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s27 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s28 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s29 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s30 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s31 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s1 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s9 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s19 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s12 +; GREEDY908-NEXT: v_mov_b32_e32 v4, s20 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s21 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, s13 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s14 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s15 +; GREEDY908-NEXT: v_mov_b32_e32 v3, 1.0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v4 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v2 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 +; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] +; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 +; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 +; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 +; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 +; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 +; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 +; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 +; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 +; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GREEDY908-NEXT: s_endpgm +; +; GREEDY90A-LABEL: test_mfma_f32_32x32x1f32: +; GREEDY90A: ; %bb.0: ; %bb +; GREEDY90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s16 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s17 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s18 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s20 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s21 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s22 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s23 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s24 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s25 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s26 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s27 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s28 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s29 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s30 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s31 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] +; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 2 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 2 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35] +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16 +; GREEDY90A-NEXT: s_endpgm +; +; GREEDY942-LABEL: test_mfma_f32_32x32x1f32: +; GREEDY942: ; %bb.0: ; %bb +; GREEDY942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s16 +; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s17 +; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s18 +; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s19 +; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s20 +; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s21 +; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s22 +; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s23 +; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s24 +; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s25 +; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s26 +; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s27 +; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s28 +; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s29 +; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s30 +; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s31 +; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31] +; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35] +; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16 +; GREEDY942-NEXT: s_endpgm +; +; GREEDY90A-GISEL-LABEL: test_mfma_f32_32x32x1f32: +; GREEDY90A-GISEL: ; %bb.0: ; %bb +; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a17, s17 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a18, s18 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a19, s19 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a20, s20 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a21, s21 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a22, s22 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a23, s23 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a24, s24 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a25, s25 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a26, s26 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a27, s27 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a28, s28 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a29, s29 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a30, s30 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a31, s31 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] +; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 2 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a34 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a35 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a36 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a37 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a38 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a39 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a40 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a41 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a42 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a43 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a44 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a45 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a16, a46 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a17, a47 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a18, a48 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a19, a49 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a20, a50 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a21, a51 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a22, a52 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a23, a53 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a24, a54 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a25, a55 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a26, a56 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a27, a57 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a28, a58 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a29, a59 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a30, a60 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a31, a61 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GREEDY90A-GISEL-NEXT: s_endpgm +; +; FAST90A-LABEL: test_mfma_f32_32x32x1f32: +; FAST90A: ; %bb.0: ; %bb +; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: s_waitcnt lgkmcnt(0) +; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 +; FAST90A-NEXT: s_waitcnt lgkmcnt(0) +; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36 +; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 +; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38 +; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 +; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 +; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 +; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 +; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 +; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 +; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 +; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 +; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 +; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 +; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 +; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 +; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 +; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] +; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 +; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 +; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 +; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 +; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 +; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 +; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 +; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 +; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 +; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 +; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 +; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 +; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 +; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 +; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 +; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 +; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13 +; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 +; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 +; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 +; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 +; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 +; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 +; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 +; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 +; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 +; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 +; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 +; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 +; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 +; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 +; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 +; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32 +; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31 +; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 +; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 +; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 +; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 +; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 +; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 +; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 +; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 +; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 +; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 +; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 +; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 +; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 +; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 +; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 +; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 +; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 +; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 +; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 +; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 +; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 +; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 +; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 +; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 +; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 +; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 +; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 +; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; FAST90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; FAST90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; FAST90A-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -31,20 +631,282 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: -; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] -; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] - -; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] -; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] - -; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] - -; FAST: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; GCN: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { +; GREEDY908-LABEL: test_mfma_f32_16x16x1f32: +; GREEDY908: ; %bb.0: ; %bb +; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 +; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GREEDY908-NEXT: s_endpgm +; +; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: +; GREEDY90A: ; %bb.0: ; %bb +; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 2 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY90A-NEXT: s_endpgm +; +; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: +; GREEDY942: ; %bb.0: ; %bb +; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 +; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] +; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 0 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY942-NEXT: s_endpgm +; +; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32: +; GREEDY90A-GISEL: ; %bb.0: ; %bb +; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] +; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 2 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a19 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a20 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a21 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a22 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a23 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a24 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a25 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a26 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a27 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a28 +; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY90A-GISEL-NEXT: s_endpgm +; +; FAST90A-LABEL: test_mfma_f32_16x16x1f32: +; FAST90A: ; %bb.0: ; %bb +; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: s_waitcnt lgkmcnt(0) +; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; FAST90A-NEXT: s_waitcnt lgkmcnt(0) +; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a4, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a5, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a6, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a7, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a8, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a9, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a10, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a11, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a12, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a13, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] +; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 +; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 +; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 +; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19 +; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20 +; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21 +; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22 +; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23 +; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24 +; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25 +; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26 +; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27 +; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 +; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; FAST90A-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -57,13 +919,129 @@ bb: ; This instruction allows the overlap since it only read 4 registers. -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: -; GREEDY: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; GREEDY: v_mfma_f32_4x4x1{{.*}} a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; FAST: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; FAST: v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; GCN: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { +; GREEDY908-LABEL: test_mfma_f32_4x4x1f32: +; GREEDY908: ; %bb.0: ; %bb +; GREEDY908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 +; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s1 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v5 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY908-NEXT: s_nop 3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GREEDY908-NEXT: s_endpgm +; +; GREEDY90A-LABEL: test_mfma_f32_4x4x1f32: +; GREEDY90A: ; %bb.0: ; %bb +; GREEDY90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-NEXT: s_nop 4 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GREEDY90A-NEXT: s_endpgm +; +; GREEDY942-LABEL: test_mfma_f32_4x4x1f32: +; GREEDY942: ; %bb.0: ; %bb +; GREEDY942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v0, v1, a[0:3] +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: s_nop 3 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GREEDY942-NEXT: s_endpgm +; +; GREEDY90A-GISEL-LABEL: test_mfma_f32_4x4x1f32: +; GREEDY90A-GISEL: ; %bb.0: ; %bb +; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-GISEL-NEXT: s_nop 3 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GREEDY90A-GISEL-NEXT: s_endpgm +; +; FAST90A-LABEL: test_mfma_f32_4x4x1f32: +; FAST90A: ; %bb.0: ; %bb +; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: s_waitcnt lgkmcnt(0) +; FAST90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; FAST90A-NEXT: s_waitcnt lgkmcnt(0) +; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3] +; FAST90A-NEXT: s_nop 4 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a4 +; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a5 +; FAST90A-NEXT: s_nop 1 +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; FAST90A-NEXT: s_nop 4 +; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; FAST90A-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)