diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 7d85d3439eed9..beda16c17a5c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1,13 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index d358837452eab..8081a15b53bb7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a31, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a30, s22 -; GCN-NEXT: v_accvgpr_write_b32 a29, s21 -; GCN-NEXT: v_accvgpr_write_b32 a28, s20 -; GCN-NEXT: v_accvgpr_write_b32 a27, s19 -; GCN-NEXT: v_accvgpr_write_b32 a26, s18 -; GCN-NEXT: v_accvgpr_write_b32 a25, s17 -; GCN-NEXT: v_accvgpr_write_b32 a24, s16 -; GCN-NEXT: v_accvgpr_write_b32 a23, s15 -; GCN-NEXT: v_accvgpr_write_b32 a22, s14 -; GCN-NEXT: v_accvgpr_write_b32 a21, s13 -; GCN-NEXT: v_accvgpr_write_b32 a20, s12 -; GCN-NEXT: v_accvgpr_write_b32 a19, s11 -; GCN-NEXT: v_accvgpr_write_b32 a18, s10 -; GCN-NEXT: v_accvgpr_write_b32 a17, s9 -; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] -; GCN-NEXT: v_mov_b32_e32 v12, s22 -; GCN-NEXT: v_mov_b32_e32 v13, s23 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v40, s20 +; GCN-NEXT: v_mov_b32_e32 v41, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] +; GCN-NEXT: v_mov_b32_e32 v42, s22 +; GCN-NEXT: v_mov_b32_e32 v43, s23 +; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NEXT: v_mov_b32_e32 v18, s14 +; GCN-NEXT: v_mov_b32_e32 v19, s15 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a31, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a30, s22 -; GCN-NEXT: v_accvgpr_write_b32 a29, s21 -; GCN-NEXT: v_accvgpr_write_b32 a28, s20 -; GCN-NEXT: v_accvgpr_write_b32 a27, s19 -; GCN-NEXT: v_accvgpr_write_b32 a26, s18 -; GCN-NEXT: v_accvgpr_write_b32 a25, s17 -; GCN-NEXT: v_accvgpr_write_b32 a24, s16 -; GCN-NEXT: v_accvgpr_write_b32 a23, s15 -; GCN-NEXT: v_accvgpr_write_b32 a22, s14 -; GCN-NEXT: v_accvgpr_write_b32 a21, s13 -; GCN-NEXT: v_accvgpr_write_b32 a20, s12 -; GCN-NEXT: v_accvgpr_write_b32 a19, s11 -; GCN-NEXT: v_accvgpr_write_b32 a18, s10 -; GCN-NEXT: v_accvgpr_write_b32 a17, s9 -; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v12, s22 -; GCN-NEXT: v_mov_b32_e32 v13, s23 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v40, s20 +; GCN-NEXT: v_mov_b32_e32 v41, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v42, s22 +; GCN-NEXT: v_mov_b32_e32 v43, s23 +; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NEXT: v_mov_b32_e32 v18, s14 +; GCN-NEXT: v_mov_b32_e32 v19, s15 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) @@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a2, s10 -; GCN-NEXT: v_accvgpr_write_b32 a3, s11 -; GCN-NEXT: v_accvgpr_write_b32 a4, s12 -; GCN-NEXT: v_accvgpr_write_b32 a5, s13 -; GCN-NEXT: v_accvgpr_write_b32 a6, s14 -; GCN-NEXT: v_accvgpr_write_b32 a7, s15 -; GCN-NEXT: v_accvgpr_write_b32 a8, s16 -; GCN-NEXT: v_accvgpr_write_b32 a9, s17 -; GCN-NEXT: v_accvgpr_write_b32 a10, s18 -; GCN-NEXT: v_accvgpr_write_b32 a11, s19 -; GCN-NEXT: v_accvgpr_write_b32 a12, s20 -; GCN-NEXT: v_accvgpr_write_b32 a13, s21 -; GCN-NEXT: v_accvgpr_write_b32 a14, s22 -; GCN-NEXT: v_accvgpr_write_b32 a15, s23 +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out @@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a2, s10 -; GCN-NEXT: v_accvgpr_write_b32 a3, s11 -; GCN-NEXT: v_accvgpr_write_b32 a4, s12 -; GCN-NEXT: v_accvgpr_write_b32 a5, s13 -; GCN-NEXT: v_accvgpr_write_b32 a6, s14 -; GCN-NEXT: v_accvgpr_write_b32 a7, s15 -; GCN-NEXT: v_accvgpr_write_b32 a8, s16 -; GCN-NEXT: v_accvgpr_write_b32 a9, s17 -; GCN-NEXT: v_accvgpr_write_b32 a10, s18 -; GCN-NEXT: v_accvgpr_write_b32 a11, s19 -; GCN-NEXT: v_accvgpr_write_b32 a12, s20 -; GCN-NEXT: v_accvgpr_write_b32 a13, s21 -; GCN-NEXT: v_accvgpr_write_b32 a14, s22 -; GCN-NEXT: v_accvgpr_write_b32 a15, s23 +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 21465beb21de7..d81ec1c61634b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v44, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v44, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v10, s20 -; HEURRC-NEXT: v_mov_b32_e32 v11, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v12, s22 -; HEURRC-NEXT: v_mov_b32_e32 v13, s23 -; HEURRC-NEXT: v_mov_b32_e32 v0, s16 -; HEURRC-NEXT: v_mov_b32_e32 v1, s17 -; HEURRC-NEXT: v_mov_b32_e32 v2, s18 -; HEURRC-NEXT: v_mov_b32_e32 v3, s19 -; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_mov_b32_e32 v40, s20 +; HEURRC-NEXT: v_mov_b32_e32 v41, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v42, s22 +; HEURRC-NEXT: v_mov_b32_e32 v43, s23 +; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s12 -; HEURRC-NEXT: v_mov_b32_e32 v1, s13 -; HEURRC-NEXT: v_mov_b32_e32 v2, s14 -; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1687,7 +1653,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] @@ -1701,41 +1667,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: @@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v44, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v44, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v10, s20 -; HEURRC-NEXT: v_mov_b32_e32 v11, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v12, s22 -; HEURRC-NEXT: v_mov_b32_e32 v13, s23 -; HEURRC-NEXT: v_mov_b32_e32 v0, s16 -; HEURRC-NEXT: v_mov_b32_e32 v1, s17 -; HEURRC-NEXT: v_mov_b32_e32 v2, s18 -; HEURRC-NEXT: v_mov_b32_e32 v3, s19 -; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_mov_b32_e32 v40, s20 +; HEURRC-NEXT: v_mov_b32_e32 v41, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v42, s22 +; HEURRC-NEXT: v_mov_b32_e32 v43, s23 +; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s12 -; HEURRC-NEXT: v_mov_b32_e32 v1, s13 -; HEURRC-NEXT: v_mov_b32_e32 v2, s14 -; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -2051,7 +1995,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] @@ -2065,41 +2009,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: @@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2781,24 +2677,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2827,24 +2721,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: v_mov_b32_e32 v6, s12 -; HEURRC-NEXT: v_mov_b32_e32 v7, s13 -; HEURRC-NEXT: v_mov_b32_e32 v8, s14 -; HEURRC-NEXT: v_mov_b32_e32 v9, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2852,24 +2746,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -2930,24 +2824,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2976,24 +2868,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: v_mov_b32_e32 v6, s12 -; HEURRC-NEXT: v_mov_b32_e32 v7, s13 -; HEURRC-NEXT: v_mov_b32_e32 v8, s14 -; HEURRC-NEXT: v_mov_b32_e32 v9, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -3001,24 +2893,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -4246,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v6, s24 -; SDAG-NEXT: v_mov_b32_e32 v7, s25 -; SDAG-NEXT: v_mov_b32_e32 v8, s26 -; SDAG-NEXT: v_mov_b32_e32 v9, s27 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4318,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4371,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v40, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v6, s24 -; HEURRC-NEXT: v_mov_b32_e32 v7, s25 -; HEURRC-NEXT: v_mov_b32_e32 v8, s26 -; HEURRC-NEXT: v_mov_b32_e32 v9, s27 +; HEURRC-NEXT: v_mov_b32_e32 v36, s24 +; HEURRC-NEXT: v_mov_b32_e32 v37, s25 +; HEURRC-NEXT: v_mov_b32_e32 v38, s26 +; HEURRC-NEXT: v_mov_b32_e32 v39, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s16 -; HEURRC-NEXT: v_mov_b32_e32 v3, s17 -; HEURRC-NEXT: v_mov_b32_e32 v4, s18 -; HEURRC-NEXT: v_mov_b32_e32 v5, s19 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s12 -; HEURRC-NEXT: v_mov_b32_e32 v3, s13 -; HEURRC-NEXT: v_mov_b32_e32 v4, s14 -; HEURRC-NEXT: v_mov_b32_e32 v5, s15 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4442,17 +4312,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4463,42 +4333,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: @@ -4645,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v6, s24 -; SDAG-NEXT: v_mov_b32_e32 v7, s25 -; SDAG-NEXT: v_mov_b32_e32 v8, s26 -; SDAG-NEXT: v_mov_b32_e32 v9, s27 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4717,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4770,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v40, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v6, s24 -; HEURRC-NEXT: v_mov_b32_e32 v7, s25 -; HEURRC-NEXT: v_mov_b32_e32 v8, s26 -; HEURRC-NEXT: v_mov_b32_e32 v9, s27 +; HEURRC-NEXT: v_mov_b32_e32 v36, s24 +; HEURRC-NEXT: v_mov_b32_e32 v37, s25 +; HEURRC-NEXT: v_mov_b32_e32 v38, s26 +; HEURRC-NEXT: v_mov_b32_e32 v39, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s16 -; HEURRC-NEXT: v_mov_b32_e32 v3, s17 -; HEURRC-NEXT: v_mov_b32_e32 v4, s18 -; HEURRC-NEXT: v_mov_b32_e32 v5, s19 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s12 -; HEURRC-NEXT: v_mov_b32_e32 v3, s13 -; HEURRC-NEXT: v_mov_b32_e32 v4, s14 -; HEURRC-NEXT: v_mov_b32_e32 v5, s15 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4841,17 +4689,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4862,42 +4710,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: @@ -5045,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5088,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5124,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v4, s24 -; HEURRC-NEXT: v_mov_b32_e32 v5, s25 -; HEURRC-NEXT: v_mov_b32_e32 v6, s26 -; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: v_mov_b32_e32 v20, s24 +; HEURRC-NEXT: v_mov_b32_e32 v21, s25 +; HEURRC-NEXT: v_mov_b32_e32 v22, s26 +; HEURRC-NEXT: v_mov_b32_e32 v23, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5279,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5322,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5358,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v4, s24 -; HEURRC-NEXT: v_mov_b32_e32 v5, s25 -; HEURRC-NEXT: v_mov_b32_e32 v6, s26 -; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: v_mov_b32_e32 v20, s24 +; HEURRC-NEXT: v_mov_b32_e32 v21, s25 +; HEURRC-NEXT: v_mov_b32_e32 v22, s26 +; HEURRC-NEXT: v_mov_b32_e32 v23, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5643,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5664,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5747,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5768,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5845,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 37809da10241b..f78ea92b4840b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v20, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1964,40 +1962,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2031,40 +2025,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2096,34 +2086,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2162,34 +2148,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: @@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index bc50058778dbf..0b2818f38149d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: @@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b32_e32 v32, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: s_movk_i32 s2, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -5031,77 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v44, s24 +; SDAG-NEXT: v_mov_b32_e32 v45, s25 +; SDAG-NEXT: v_mov_b32_e32 v46, s26 +; SDAG-NEXT: v_mov_b32_e32 v47, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5109,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a30, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a29, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a28, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a27, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a26, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a25, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a24, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a23, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a22, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a21, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a20, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a19, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5177,77 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5255,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -6298,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index ea9334a6c74d3..31a48de4133ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942-STRESS,GFX942-SDAG-STRESS %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942-STRESS,GFX942-GISEL-STRESS %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) @@ -51,50 +49,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_nop 5 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm -; -; GFX942-SDAG-STRESS-LABEL: test_mfma_f32_16x16x8xf32: -; GFX942-SDAG-STRESS: ; %bb.0: ; %bb -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v3, 4.0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-STRESS-NEXT: s_nop 1 -; GFX942-SDAG-STRESS-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-STRESS-NEXT: s_nop 6 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-STRESS-NEXT: s_endpgm -; -; GFX942-GISEL-STRESS-LABEL: test_mfma_f32_16x16x8xf32: -; GFX942-GISEL-STRESS: ; %bb.0: ; %bb -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 1.0 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s2, 0x40400000 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 2.0 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s3, 4.0 -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-STRESS-NEXT: s_nop 1 -; GFX942-GISEL-STRESS-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-STRESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-STRESS-NEXT: s_nop 5 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-GISEL-STRESS-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> , <2 x float> , <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -178,82 +132,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm -; -; GFX942-SDAG-STRESS-LABEL: test_mfma_f32_32x32x4xf32: -; GFX942-SDAG-STRESS: ; %bb.0: ; %bb -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v3, 4.0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a15, s15 -; GFX942-SDAG-STRESS-NEXT: s_nop 1 -; GFX942-SDAG-STRESS-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-STRESS-NEXT: s_nop 7 -; GFX942-SDAG-STRESS-NEXT: s_nop 1 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-SDAG-STRESS-NEXT: s_endpgm -; -; GFX942-GISEL-STRESS-LABEL: test_mfma_f32_32x32x4xf32: -; GFX942-GISEL-STRESS: ; %bb.0: ; %bb -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a15, s15 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 1.0 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 2.0 -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 0x40400000 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 4.0 -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX942-GISEL-STRESS-NEXT: s_nop 1 -; GFX942-GISEL-STRESS-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-STRESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-STRESS-NEXT: s_nop 7 -; GFX942-GISEL-STRESS-NEXT: s_nop 1 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-GISEL-STRESS-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> , <2 x float> , <16 x float> %in.1, i32 1, i32 2, i32 3) @@ -264,4 +142,3 @@ bb: attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX942: {{.*}} -; GFX942-STRESS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 80568810e42b5..b25fe8392a60e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v13, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: @@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }