Skip to content

Commit c34cdd7

Browse files
authored
AMDGPU: Add tests for every mfma intrinsic v-to-a mapping (#153026)
Make sure the MFMA VGPR to AGPR InstrMapping table is complete. I think I got everything, except the full cross product of input types with the mfma scale intrinsics. Also makes sure we have coverage for smfmac and mfma_scale cases.
1 parent 0d29279 commit c34cdd7

File tree

3 files changed

+1716
-0
lines changed

3 files changed

+1716
-0
lines changed
Lines changed: 141 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,141 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck %s
3+
; Each function below calls one MFMA intrinsic whose result is consumed by
; inline asm through an "a" register constraint. The expected output has the
; MFMA using a[...] registers for both its destination and its accumulator
; input, with the accumulator loaded directly into a[...] registers.
; NOTE(review): -amdgpu-mfma-vgpr-form presumably makes instruction selection
; start from the VGPR form of the MFMA, so these tests exercise the
; VGPR-to-AGPR opcode mapping - confirm against the pass that does the rewrite.
4+
target triple = "amdgcn-amd-amdhsa"
5+
6+
; llvm.amdgcn.mfma.i32.32x32x8i8 with a <16 x i32> accumulator loaded from %ptr.
; %arg0/%arg1 are the two i32 multiplicand operands; the three trailing
; immediate operands are all zero. The result is passed to inline asm with an
; "a" constraint, and the expected output has the accumulator loaded by four
; dwordx4 loads straight into a[0:15], which the MFMA uses as both source and
; destination.
define void @test_rewrite_mfma_i32_32x32x8i8(i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) #0 {
7+
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x8i8:
8+
; CHECK: ; %bb.0:
9+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10+
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
11+
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
12+
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
13+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
14+
; CHECK-NEXT: s_waitcnt vmcnt(0)
15+
; CHECK-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15]
16+
; CHECK-NEXT: ;;#ASMSTART
17+
; CHECK-NEXT: ; use a[0:15]
18+
; CHECK-NEXT: ;;#ASMEND
19+
; CHECK-NEXT: s_setpc_b64 s[30:31]
20+
%src2 = load <16 x i32>, ptr addrspace(1) %ptr
21+
%mai = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %arg0, i32 %arg1, <16 x i32> %src2, i32 0, i32 0, i32 0)
22+
call void asm sideeffect "; use $0", "a"(<16 x i32> %mai)
23+
ret void
24+
}
25+
26+
; llvm.amdgcn.mfma.i32.16x16x16i8 with a <4 x i32> accumulator loaded from %ptr.
; The three trailing immediate operands are all zero. The result is passed to
; inline asm with an "a" constraint, and the expected output has a single
; dwordx4 load straight into a[0:3], which the MFMA uses as both source and
; destination.
define void @test_rewrite_mfma_i32_16x16x16i8(i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) #0 {
27+
; CHECK-LABEL: test_rewrite_mfma_i32_16x16x16i8:
28+
; CHECK: ; %bb.0:
29+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
31+
; CHECK-NEXT: s_waitcnt vmcnt(0)
32+
; CHECK-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3]
33+
; CHECK-NEXT: ;;#ASMSTART
34+
; CHECK-NEXT: ; use a[0:3]
35+
; CHECK-NEXT: ;;#ASMEND
36+
; CHECK-NEXT: s_setpc_b64 s[30:31]
37+
%src2 = load <4 x i32>, ptr addrspace(1) %ptr
38+
%mai = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %arg0, i32 %arg1, <4 x i32> %src2, i32 0, i32 0, i32 0)
39+
call void asm sideeffect "; use $0", "a"(<4 x i32> %mai)
40+
ret void
41+
}
42+
43+
; llvm.amdgcn.mfma.f32.32x32x2bf16 with <2 x i16> multiplicand operands and a
; <32 x float> accumulator loaded from %ptr. The three trailing immediate
; operands are all zero. The result is passed to inline asm with an "a"
; constraint, and the expected output has the accumulator loaded by eight
; dwordx4 loads straight into a[0:31], which the MFMA uses as both source and
; destination.
define void @test_rewrite_mfma_f32_32x32x2bf16(<2 x i16> %arg0, <2 x i16> %arg1, ptr addrspace(1) %ptr) #0 {
44+
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x2bf16:
45+
; CHECK: ; %bb.0:
46+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47+
; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
48+
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
49+
; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
50+
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
51+
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
52+
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
53+
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
54+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
55+
; CHECK-NEXT: s_waitcnt vmcnt(0)
56+
; CHECK-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[0:31]
57+
; CHECK-NEXT: ;;#ASMSTART
58+
; CHECK-NEXT: ; use a[0:31]
59+
; CHECK-NEXT: ;;#ASMEND
60+
; CHECK-NEXT: s_setpc_b64 s[30:31]
61+
%src2 = load <32 x float>, ptr addrspace(1) %ptr
62+
%mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %arg0, <2 x i16> %arg1, <32 x float> %src2, i32 0, i32 0, i32 0)
63+
call void asm sideeffect "; use $0", "a"(<32 x float> %mai)
64+
ret void
65+
}
66+
67+
; llvm.amdgcn.mfma.f32.16x16x2bf16 with <2 x i16> multiplicand operands and a
; <16 x float> accumulator loaded from %ptr. The three trailing immediate
; operands are all zero. The result is passed to inline asm with an "a"
; constraint, and the expected output has the accumulator loaded by four
; dwordx4 loads straight into a[0:15], which the MFMA uses as both source and
; destination.
define void @test_rewrite_mfma_f32_16x16x2bf16(<2 x i16> %arg0, <2 x i16> %arg1, ptr addrspace(1) %ptr) #0 {
68+
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x2bf16:
69+
; CHECK: ; %bb.0:
70+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71+
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
72+
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
73+
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
74+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
75+
; CHECK-NEXT: s_waitcnt vmcnt(0)
76+
; CHECK-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15]
77+
; CHECK-NEXT: ;;#ASMSTART
78+
; CHECK-NEXT: ; use a[0:15]
79+
; CHECK-NEXT: ;;#ASMEND
80+
; CHECK-NEXT: s_setpc_b64 s[30:31]
81+
%src2 = load <16 x float>, ptr addrspace(1) %ptr
82+
%mai = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %arg0, <2 x i16> %arg1, <16 x float> %src2, i32 0, i32 0, i32 0)
83+
call void asm sideeffect "; use $0", "a"(<16 x float> %mai)
84+
ret void
85+
}
86+
87+
; llvm.amdgcn.mfma.f32.4x4x2bf16 with <2 x i16> multiplicand operands and a
; <4 x float> accumulator loaded from %ptr. The three trailing immediate
; operands are all zero. The result is passed to inline asm with an "a"
; constraint, and the expected output has a single dwordx4 load straight into
; a[0:3], which the MFMA uses as both source and destination.
define void @test_rewrite_mfma_f32_4x4x2bf16(<2 x i16> %arg0, <2 x i16> %arg1, ptr addrspace(1) %ptr) #0 {
88+
; CHECK-LABEL: test_rewrite_mfma_f32_4x4x2bf16:
89+
; CHECK: ; %bb.0:
90+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
92+
; CHECK-NEXT: s_waitcnt vmcnt(0)
93+
; CHECK-NEXT: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[0:3]
94+
; CHECK-NEXT: ;;#ASMSTART
95+
; CHECK-NEXT: ; use a[0:3]
96+
; CHECK-NEXT: ;;#ASMEND
97+
; CHECK-NEXT: s_setpc_b64 s[30:31]
98+
%src2 = load <4 x float>, ptr addrspace(1) %ptr
99+
%mai = call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %arg0, <2 x i16> %arg1, <4 x float> %src2, i32 0, i32 0, i32 0)
100+
call void asm sideeffect "; use $0", "a"(<4 x float> %mai)
101+
ret void
102+
}
103+
104+
; llvm.amdgcn.mfma.f32.32x32x4bf16 with <2 x i16> multiplicand operands and a
; <16 x float> accumulator loaded from %ptr. The three trailing immediate
; operands are all zero. The result is passed to inline asm with an "a"
; constraint, and the expected output has the accumulator loaded by four
; dwordx4 loads straight into a[0:15], which the MFMA uses as both source and
; destination.
define void @test_rewrite_mfma_f32_32x32x4bf16(<2 x i16> %arg0, <2 x i16> %arg1, ptr addrspace(1) %ptr) #0 {
105+
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4bf16:
106+
; CHECK: ; %bb.0:
107+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108+
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
109+
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
110+
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
111+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
112+
; CHECK-NEXT: s_waitcnt vmcnt(0)
113+
; CHECK-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15]
114+
; CHECK-NEXT: ;;#ASMSTART
115+
; CHECK-NEXT: ; use a[0:15]
116+
; CHECK-NEXT: ;;#ASMEND
117+
; CHECK-NEXT: s_setpc_b64 s[30:31]
118+
%src2 = load <16 x float>, ptr addrspace(1) %ptr
119+
%mai = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %arg0, <2 x i16> %arg1, <16 x float> %src2, i32 0, i32 0, i32 0)
120+
call void asm sideeffect "; use $0", "a"(<16 x float> %mai)
121+
ret void
122+
}
123+
124+
; llvm.amdgcn.mfma.f32.16x16x8bf16 with <2 x i16> multiplicand operands and a
; <4 x float> accumulator loaded from %ptr. The three trailing immediate
; operands are all zero. The result is passed to inline asm with an "a"
; constraint, and the expected output has a single dwordx4 load straight into
; a[0:3], which the MFMA uses as both source and destination.
define void @test_rewrite_mfma_f32_16x16x8bf16(<2 x i16> %arg0, <2 x i16> %arg1, ptr addrspace(1) %ptr) #0 {
125+
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x8bf16:
126+
; CHECK: ; %bb.0:
127+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
129+
; CHECK-NEXT: s_waitcnt vmcnt(0)
130+
; CHECK-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3]
131+
; CHECK-NEXT: ;;#ASMSTART
132+
; CHECK-NEXT: ; use a[0:3]
133+
; CHECK-NEXT: ;;#ASMEND
134+
; CHECK-NEXT: s_setpc_b64 s[30:31]
135+
%src2 = load <4 x float>, ptr addrspace(1) %ptr
136+
%mai = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %arg0, <2 x i16> %arg1, <4 x float> %src2, i32 0, i32 0, i32 0)
137+
call void asm sideeffect "; use $0", "a"(<4 x float> %mai)
138+
ret void
139+
}
140+
141+
; Attribute group #0, referenced by every function visible in this file:
; nounwind, "amdgpu-flat-work-group-size"="1,256", "amdgpu-waves-per-eu"="4,4".
; NOTE(review): the two string attributes are presumably "min,max" pairs, so
; waves-per-EU is pinned to exactly 4 - confirm against the AMDGPU usage guide.
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="4,4" }

0 commit comments

Comments
 (0)