-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Fix not folding splat immediate into VGPR MFMA src2 #150628
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Fix not folding splat immediate into VGPR MFMA src2 #150628
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesPatch is 20.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150628.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e5d1eaad2b8f4..b77da4d612dd4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index adbc2df4b3474..7d5bc3f4c0c39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1083,58 +1083,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX90A-VGPR: ; %bb.0: ; %bb
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-VGPR-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double 1.0), i32 0, i32 0, i32 0)
@@ -1184,58 +1162,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX90A-VGPR: ; %bb.0: ; %bb
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0xbff00000
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-VGPR-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0xbff00000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double -1.0), i32 0, i32 0, i32 0)
@@ -1285,58 +1241,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX90A-VGPR: ; %bb.0: ; %bb
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 64
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-VGPR-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 64 to double)), i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index d7257a49b00bb..b4a6451908a6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -3238,27 +3238,11 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v17, v18, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
@@ -3463,13 +3447,10 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -4483,13 +4464,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v5, v[0:3]
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -4627,25 +4605,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
@@ -4797,36 +4760,20 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[8...
[truncated]
|
Merge activity
|
|
I suspect this regressed in 3601849 and just didn't have tests, so perhaps this should go to the release branch |
ad766da to
4ee7be5
Compare
3c67b08 to
8ccb289
Compare
4ee7be5 to
2f95bc3
Compare
8ccb289 to
4a00166
Compare
54837d8 to
4d8c839
Compare
859c60f to
5c71bc4
Compare
5c71bc4 to
4cbec19
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/11/builds/20420 Here is the relevant piece of the build log for the reference |

No description provided.