Skip to content

Commit 7402cd6

Browse files
committed
AMDGPU: Disable AGPR selection in mfma rewrite test
Pass -amdgpu-mfma-vgpr-form on the RUN line to disable AGPR selection, so that the test actually exercises the intended rewrite pass. Also add tests that use inline immediates in src2, and switch the target from gfx90a to gfx942 for future test functions.
1 parent 6897ca4 commit 7402cd6

File tree

1 file changed

+143
-130
lines changed

1 file changed

+143
-130
lines changed
Lines changed: 143 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mcpu=gfx90a < %s | FileCheck %s
2+
; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
33

44
target triple = "amdgcn-amd-amdhsa"
55

66
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrspace(1) %arg) #0 {
77
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma:
88
; CHECK: ; %bb.0: ; %bb
99
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
10+
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1011
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
12+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
13+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
1114
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
1215
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
1316
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
@@ -18,117 +21,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
1821
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
1922
; CHECK-NEXT: s_nop 0
2023
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
24+
; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
25+
; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
2126
; CHECK-NEXT: s_waitcnt vmcnt(0)
22-
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
23-
; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
24-
; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
25-
; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
26-
; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
27-
; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
28-
; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
29-
; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
30-
; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
31-
; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
32-
; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
33-
; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
34-
; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
35-
; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
36-
; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
37-
; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
38-
; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
39-
; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
40-
; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
41-
; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
42-
; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
43-
; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
44-
; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
45-
; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
46-
; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
47-
; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
48-
; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
49-
; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
50-
; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
51-
; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
52-
; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
53-
; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
54-
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
55-
; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
56-
; CHECK-NEXT: s_nop 1
57-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
58-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
27+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
28+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
5929
; CHECK-NEXT: s_nop 7
6030
; CHECK-NEXT: s_nop 7
61-
; CHECK-NEXT: s_nop 2
62-
; CHECK-NEXT: v_accvgpr_read_b32 v4, a59
63-
; CHECK-NEXT: v_accvgpr_read_b32 v5, a58
64-
; CHECK-NEXT: v_accvgpr_read_b32 v6, a57
65-
; CHECK-NEXT: v_accvgpr_read_b32 v7, a56
66-
; CHECK-NEXT: v_accvgpr_read_b32 v8, a55
67-
; CHECK-NEXT: v_accvgpr_read_b32 v9, a54
68-
; CHECK-NEXT: v_accvgpr_read_b32 v10, a53
69-
; CHECK-NEXT: v_accvgpr_read_b32 v11, a52
70-
; CHECK-NEXT: v_accvgpr_read_b32 v12, a51
71-
; CHECK-NEXT: v_accvgpr_read_b32 v13, a50
72-
; CHECK-NEXT: v_accvgpr_read_b32 v14, a49
73-
; CHECK-NEXT: v_accvgpr_read_b32 v15, a48
74-
; CHECK-NEXT: v_accvgpr_read_b32 v16, a47
75-
; CHECK-NEXT: v_accvgpr_read_b32 v17, a46
76-
; CHECK-NEXT: v_accvgpr_read_b32 v18, a45
77-
; CHECK-NEXT: v_accvgpr_read_b32 v19, a44
78-
; CHECK-NEXT: v_accvgpr_read_b32 v20, a43
79-
; CHECK-NEXT: v_accvgpr_read_b32 v21, a42
80-
; CHECK-NEXT: v_accvgpr_read_b32 v22, a41
81-
; CHECK-NEXT: v_accvgpr_read_b32 v23, a40
82-
; CHECK-NEXT: v_accvgpr_read_b32 v24, a39
83-
; CHECK-NEXT: v_accvgpr_read_b32 v25, a38
84-
; CHECK-NEXT: v_accvgpr_read_b32 v26, a37
85-
; CHECK-NEXT: v_accvgpr_read_b32 v27, a36
86-
; CHECK-NEXT: v_accvgpr_read_b32 v28, a35
87-
; CHECK-NEXT: v_accvgpr_read_b32 v29, a34
88-
; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32
89-
; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33
90-
; CHECK-NEXT: v_accvgpr_write_b32 a4, v29
91-
; CHECK-NEXT: v_accvgpr_write_b32 a5, v28
92-
; CHECK-NEXT: v_accvgpr_write_b32 a6, v27
93-
; CHECK-NEXT: v_accvgpr_write_b32 a7, v26
94-
; CHECK-NEXT: v_accvgpr_write_b32 a8, v25
95-
; CHECK-NEXT: v_accvgpr_write_b32 a9, v24
96-
; CHECK-NEXT: v_accvgpr_write_b32 a10, v23
97-
; CHECK-NEXT: v_accvgpr_write_b32 a11, v22
98-
; CHECK-NEXT: v_accvgpr_write_b32 a12, v21
99-
; CHECK-NEXT: v_accvgpr_write_b32 a13, v20
100-
; CHECK-NEXT: v_accvgpr_write_b32 a14, v19
101-
; CHECK-NEXT: v_accvgpr_write_b32 a15, v18
102-
; CHECK-NEXT: v_accvgpr_write_b32 a16, v17
103-
; CHECK-NEXT: v_accvgpr_write_b32 a17, v16
104-
; CHECK-NEXT: v_accvgpr_write_b32 a18, v15
105-
; CHECK-NEXT: v_accvgpr_write_b32 a19, v14
106-
; CHECK-NEXT: v_accvgpr_write_b32 a20, v13
107-
; CHECK-NEXT: v_accvgpr_write_b32 a21, v12
108-
; CHECK-NEXT: v_accvgpr_write_b32 a22, v11
109-
; CHECK-NEXT: v_accvgpr_write_b32 a23, v10
110-
; CHECK-NEXT: v_accvgpr_write_b32 a24, v9
111-
; CHECK-NEXT: v_accvgpr_write_b32 a25, v8
112-
; CHECK-NEXT: v_accvgpr_write_b32 a26, v7
113-
; CHECK-NEXT: v_accvgpr_write_b32 a27, v6
114-
; CHECK-NEXT: v_accvgpr_write_b32 a28, v5
115-
; CHECK-NEXT: v_accvgpr_write_b32 a29, v4
116-
; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60
117-
; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61
11831
; CHECK-NEXT: s_nop 1
119-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
120-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
32+
; CHECK-NEXT: v_mov_b32_e32 v2, v32
33+
; CHECK-NEXT: v_mov_b32_e32 v3, v33
34+
; CHECK-NEXT: v_mov_b32_e32 v4, v34
35+
; CHECK-NEXT: v_mov_b32_e32 v5, v35
36+
; CHECK-NEXT: v_mov_b32_e32 v6, v36
37+
; CHECK-NEXT: v_mov_b32_e32 v7, v37
38+
; CHECK-NEXT: v_mov_b32_e32 v8, v38
39+
; CHECK-NEXT: v_mov_b32_e32 v9, v39
40+
; CHECK-NEXT: v_mov_b32_e32 v10, v40
41+
; CHECK-NEXT: v_mov_b32_e32 v11, v41
42+
; CHECK-NEXT: v_mov_b32_e32 v12, v42
43+
; CHECK-NEXT: v_mov_b32_e32 v13, v43
44+
; CHECK-NEXT: v_mov_b32_e32 v14, v44
45+
; CHECK-NEXT: v_mov_b32_e32 v15, v45
46+
; CHECK-NEXT: v_mov_b32_e32 v16, v46
47+
; CHECK-NEXT: v_mov_b32_e32 v17, v47
48+
; CHECK-NEXT: v_mov_b32_e32 v18, v48
49+
; CHECK-NEXT: v_mov_b32_e32 v19, v49
50+
; CHECK-NEXT: v_mov_b32_e32 v20, v50
51+
; CHECK-NEXT: v_mov_b32_e32 v21, v51
52+
; CHECK-NEXT: v_mov_b32_e32 v22, v52
53+
; CHECK-NEXT: v_mov_b32_e32 v23, v53
54+
; CHECK-NEXT: v_mov_b32_e32 v24, v54
55+
; CHECK-NEXT: v_mov_b32_e32 v25, v55
56+
; CHECK-NEXT: v_mov_b32_e32 v26, v56
57+
; CHECK-NEXT: v_mov_b32_e32 v27, v57
58+
; CHECK-NEXT: v_mov_b32_e32 v28, v58
59+
; CHECK-NEXT: v_mov_b32_e32 v29, v59
60+
; CHECK-NEXT: v_mov_b32_e32 v30, v60
61+
; CHECK-NEXT: v_mov_b32_e32 v31, v61
62+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
63+
; CHECK-NEXT: s_nop 0
64+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
12165
; CHECK-NEXT: s_nop 7
12266
; CHECK-NEXT: s_nop 7
12367
; CHECK-NEXT: s_nop 1
124-
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
125-
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
126-
; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
127-
; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
128-
; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
129-
; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
130-
; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
131-
; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
68+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
69+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
70+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
71+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
72+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
73+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
74+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
75+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
13276
; CHECK-NEXT: s_endpgm
13377
bb:
13478
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -146,35 +90,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
14690
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle:
14791
; CHECK: ; %bb.0: ; %bb
14892
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
93+
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
14994
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
150-
; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
95+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
96+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
15197
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
152-
; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
153-
; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
154-
; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
155-
; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
156-
; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
157-
; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
158-
; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
159-
; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
160-
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
161-
; CHECK-NEXT: s_waitcnt vmcnt(0)
98+
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
99+
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
100+
; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
101+
; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
102+
; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
103+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
104+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
162105
; CHECK-NEXT: s_nop 0
163-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
164-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
165-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
166-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
106+
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
107+
; CHECK-NEXT: s_waitcnt vmcnt(0)
108+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
109+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
110+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
111+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
167112
; CHECK-NEXT: s_nop 7
168113
; CHECK-NEXT: s_nop 7
169-
; CHECK-NEXT: s_nop 1
170-
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
171-
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
172-
; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
173-
; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
174-
; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
175-
; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
176-
; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
177-
; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
114+
; CHECK-NEXT: s_nop 0
115+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
116+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
117+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
118+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
119+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
120+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
121+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
122+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
178123
; CHECK-NEXT: s_endpgm
179124
bb:
180125
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -187,9 +132,77 @@ bb:
187132
ret void
188133
}
189134

135+
; Chain of three llvm.amdgcn.mfma.f32.32x32x1f32 calls where the first call's
; accumulator operand (src2) is the constant zeroinitializer and each later
; call accumulates into the previous result; the final value is stored to %arg.
; The expected assembly below has the first v_mfma take the literal 0 as its
; last operand, and the following two reuse v[0:31].
; NOTE(review): %id, %gep and %in.1 are computed but %in.1 is never used by any
; later instruction in this function, and the expected assembly contains no
; global_load. This looks like leftover from the tests above; confirm whether
; the dead load is intentional before removing it.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(ptr addrspace(1) %arg) #0 {
136+
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2:
137+
; CHECK: ; %bb.0: ; %bb
138+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
139+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
140+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
141+
; CHECK-NEXT: s_nop 0
142+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
143+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
144+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
145+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
146+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
147+
; CHECK-NEXT: s_nop 7
148+
; CHECK-NEXT: s_nop 7
149+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
150+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
151+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
152+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
153+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
154+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
155+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
156+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
157+
; CHECK-NEXT: s_endpgm
158+
bb:
159+
%id = call i32 @llvm.amdgcn.workitem.id.x()
160+
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
161+
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
162+
%mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
163+
%mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
164+
%mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
165+
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
166+
ret void
167+
}
168+
169+
; Chain of three llvm.amdgcn.mfma.f32.32x32x1f32 calls where the first call's
; accumulator operand (src2) is the constant splat (float 1.0) and each later
; call accumulates into the previous result; the final value is stored to %arg.
; The expected assembly below has the first v_mfma take the literal 1.0 as its
; last operand, and the following two reuse v[0:31].
; NOTE(review): %id, %gep and %in.1 are computed but %in.1 is never used by any
; later instruction in this function, and the expected assembly contains no
; global_load. This looks like leftover from the tests above; confirm whether
; the dead load is intentional before removing it.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(ptr addrspace(1) %arg) #0 {
170+
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2:
171+
; CHECK: ; %bb.0: ; %bb
172+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
173+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
174+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
175+
; CHECK-NEXT: s_nop 0
176+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 1.0
177+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
178+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
179+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
180+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
181+
; CHECK-NEXT: s_nop 7
182+
; CHECK-NEXT: s_nop 7
183+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
184+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
185+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
186+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
187+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
188+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
189+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
190+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
191+
; CHECK-NEXT: s_endpgm
192+
bb:
193+
%id = call i32 @llvm.amdgcn.workitem.id.x()
194+
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
195+
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
196+
%mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> splat (float 1.0), i32 0, i32 0, i32 0)
197+
%mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
198+
%mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
199+
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
200+
ret void
201+
}
202+
190203
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #1
191204
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
192205

193-
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,4" }
206+
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="4,4" }
194207
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
195208
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

0 commit comments

Comments
 (0)