Skip to content

Commit 21523ff

Browse files
author
git apple-llvm automerger
committed
Merge commit '7402cd6ded24' from llvm.org/main into next
2 parents 55fe9fd + 7402cd6 commit 21523ff

File tree

1 file changed

+143
-130
lines changed

1 file changed

+143
-130
lines changed
Lines changed: 143 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mcpu=gfx90a < %s | FileCheck %s
2+
; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
33

44
target triple = "amdgcn-amd-amdhsa"
55

66
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrspace(1) %arg) #0 {
77
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma:
88
; CHECK: ; %bb.0: ; %bb
99
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
10+
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1011
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
12+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
13+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
1114
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
1215
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
1316
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
@@ -18,117 +21,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
1821
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
1922
; CHECK-NEXT: s_nop 0
2023
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
24+
; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
25+
; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
2126
; CHECK-NEXT: s_waitcnt vmcnt(0)
22-
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
23-
; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
24-
; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
25-
; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
26-
; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
27-
; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
28-
; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
29-
; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
30-
; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
31-
; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
32-
; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
33-
; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
34-
; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
35-
; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
36-
; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
37-
; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
38-
; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
39-
; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
40-
; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
41-
; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
42-
; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
43-
; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
44-
; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
45-
; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
46-
; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
47-
; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
48-
; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
49-
; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
50-
; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
51-
; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
52-
; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
53-
; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
54-
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
55-
; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
56-
; CHECK-NEXT: s_nop 1
57-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
58-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
27+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
28+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
5929
; CHECK-NEXT: s_nop 7
6030
; CHECK-NEXT: s_nop 7
61-
; CHECK-NEXT: s_nop 2
62-
; CHECK-NEXT: v_accvgpr_read_b32 v4, a59
63-
; CHECK-NEXT: v_accvgpr_read_b32 v5, a58
64-
; CHECK-NEXT: v_accvgpr_read_b32 v6, a57
65-
; CHECK-NEXT: v_accvgpr_read_b32 v7, a56
66-
; CHECK-NEXT: v_accvgpr_read_b32 v8, a55
67-
; CHECK-NEXT: v_accvgpr_read_b32 v9, a54
68-
; CHECK-NEXT: v_accvgpr_read_b32 v10, a53
69-
; CHECK-NEXT: v_accvgpr_read_b32 v11, a52
70-
; CHECK-NEXT: v_accvgpr_read_b32 v12, a51
71-
; CHECK-NEXT: v_accvgpr_read_b32 v13, a50
72-
; CHECK-NEXT: v_accvgpr_read_b32 v14, a49
73-
; CHECK-NEXT: v_accvgpr_read_b32 v15, a48
74-
; CHECK-NEXT: v_accvgpr_read_b32 v16, a47
75-
; CHECK-NEXT: v_accvgpr_read_b32 v17, a46
76-
; CHECK-NEXT: v_accvgpr_read_b32 v18, a45
77-
; CHECK-NEXT: v_accvgpr_read_b32 v19, a44
78-
; CHECK-NEXT: v_accvgpr_read_b32 v20, a43
79-
; CHECK-NEXT: v_accvgpr_read_b32 v21, a42
80-
; CHECK-NEXT: v_accvgpr_read_b32 v22, a41
81-
; CHECK-NEXT: v_accvgpr_read_b32 v23, a40
82-
; CHECK-NEXT: v_accvgpr_read_b32 v24, a39
83-
; CHECK-NEXT: v_accvgpr_read_b32 v25, a38
84-
; CHECK-NEXT: v_accvgpr_read_b32 v26, a37
85-
; CHECK-NEXT: v_accvgpr_read_b32 v27, a36
86-
; CHECK-NEXT: v_accvgpr_read_b32 v28, a35
87-
; CHECK-NEXT: v_accvgpr_read_b32 v29, a34
88-
; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32
89-
; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33
90-
; CHECK-NEXT: v_accvgpr_write_b32 a4, v29
91-
; CHECK-NEXT: v_accvgpr_write_b32 a5, v28
92-
; CHECK-NEXT: v_accvgpr_write_b32 a6, v27
93-
; CHECK-NEXT: v_accvgpr_write_b32 a7, v26
94-
; CHECK-NEXT: v_accvgpr_write_b32 a8, v25
95-
; CHECK-NEXT: v_accvgpr_write_b32 a9, v24
96-
; CHECK-NEXT: v_accvgpr_write_b32 a10, v23
97-
; CHECK-NEXT: v_accvgpr_write_b32 a11, v22
98-
; CHECK-NEXT: v_accvgpr_write_b32 a12, v21
99-
; CHECK-NEXT: v_accvgpr_write_b32 a13, v20
100-
; CHECK-NEXT: v_accvgpr_write_b32 a14, v19
101-
; CHECK-NEXT: v_accvgpr_write_b32 a15, v18
102-
; CHECK-NEXT: v_accvgpr_write_b32 a16, v17
103-
; CHECK-NEXT: v_accvgpr_write_b32 a17, v16
104-
; CHECK-NEXT: v_accvgpr_write_b32 a18, v15
105-
; CHECK-NEXT: v_accvgpr_write_b32 a19, v14
106-
; CHECK-NEXT: v_accvgpr_write_b32 a20, v13
107-
; CHECK-NEXT: v_accvgpr_write_b32 a21, v12
108-
; CHECK-NEXT: v_accvgpr_write_b32 a22, v11
109-
; CHECK-NEXT: v_accvgpr_write_b32 a23, v10
110-
; CHECK-NEXT: v_accvgpr_write_b32 a24, v9
111-
; CHECK-NEXT: v_accvgpr_write_b32 a25, v8
112-
; CHECK-NEXT: v_accvgpr_write_b32 a26, v7
113-
; CHECK-NEXT: v_accvgpr_write_b32 a27, v6
114-
; CHECK-NEXT: v_accvgpr_write_b32 a28, v5
115-
; CHECK-NEXT: v_accvgpr_write_b32 a29, v4
116-
; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60
117-
; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61
11831
; CHECK-NEXT: s_nop 1
119-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
120-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
32+
; CHECK-NEXT: v_mov_b32_e32 v2, v32
33+
; CHECK-NEXT: v_mov_b32_e32 v3, v33
34+
; CHECK-NEXT: v_mov_b32_e32 v4, v34
35+
; CHECK-NEXT: v_mov_b32_e32 v5, v35
36+
; CHECK-NEXT: v_mov_b32_e32 v6, v36
37+
; CHECK-NEXT: v_mov_b32_e32 v7, v37
38+
; CHECK-NEXT: v_mov_b32_e32 v8, v38
39+
; CHECK-NEXT: v_mov_b32_e32 v9, v39
40+
; CHECK-NEXT: v_mov_b32_e32 v10, v40
41+
; CHECK-NEXT: v_mov_b32_e32 v11, v41
42+
; CHECK-NEXT: v_mov_b32_e32 v12, v42
43+
; CHECK-NEXT: v_mov_b32_e32 v13, v43
44+
; CHECK-NEXT: v_mov_b32_e32 v14, v44
45+
; CHECK-NEXT: v_mov_b32_e32 v15, v45
46+
; CHECK-NEXT: v_mov_b32_e32 v16, v46
47+
; CHECK-NEXT: v_mov_b32_e32 v17, v47
48+
; CHECK-NEXT: v_mov_b32_e32 v18, v48
49+
; CHECK-NEXT: v_mov_b32_e32 v19, v49
50+
; CHECK-NEXT: v_mov_b32_e32 v20, v50
51+
; CHECK-NEXT: v_mov_b32_e32 v21, v51
52+
; CHECK-NEXT: v_mov_b32_e32 v22, v52
53+
; CHECK-NEXT: v_mov_b32_e32 v23, v53
54+
; CHECK-NEXT: v_mov_b32_e32 v24, v54
55+
; CHECK-NEXT: v_mov_b32_e32 v25, v55
56+
; CHECK-NEXT: v_mov_b32_e32 v26, v56
57+
; CHECK-NEXT: v_mov_b32_e32 v27, v57
58+
; CHECK-NEXT: v_mov_b32_e32 v28, v58
59+
; CHECK-NEXT: v_mov_b32_e32 v29, v59
60+
; CHECK-NEXT: v_mov_b32_e32 v30, v60
61+
; CHECK-NEXT: v_mov_b32_e32 v31, v61
62+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
63+
; CHECK-NEXT: s_nop 0
64+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
12165
; CHECK-NEXT: s_nop 7
12266
; CHECK-NEXT: s_nop 7
12367
; CHECK-NEXT: s_nop 1
124-
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
125-
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
126-
; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
127-
; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
128-
; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
129-
; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
130-
; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
131-
; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
68+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
69+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
70+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
71+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
72+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
73+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
74+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
75+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
13276
; CHECK-NEXT: s_endpgm
13377
bb:
13478
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -146,35 +90,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
14690
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle:
14791
; CHECK: ; %bb.0: ; %bb
14892
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
93+
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
14994
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
150-
; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
95+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
96+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
15197
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
152-
; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
153-
; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
154-
; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
155-
; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
156-
; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
157-
; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
158-
; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
159-
; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
160-
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
161-
; CHECK-NEXT: s_waitcnt vmcnt(0)
98+
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
99+
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
100+
; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
101+
; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
102+
; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
103+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
104+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
162105
; CHECK-NEXT: s_nop 0
163-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
164-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
165-
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
166-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
106+
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
107+
; CHECK-NEXT: s_waitcnt vmcnt(0)
108+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
109+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
110+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
111+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
167112
; CHECK-NEXT: s_nop 7
168113
; CHECK-NEXT: s_nop 7
169-
; CHECK-NEXT: s_nop 1
170-
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
171-
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
172-
; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
173-
; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
174-
; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
175-
; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
176-
; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
177-
; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
114+
; CHECK-NEXT: s_nop 0
115+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
116+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
117+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
118+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
119+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
120+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
121+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
122+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
178123
; CHECK-NEXT: s_endpgm
179124
bb:
180125
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -187,9 +132,77 @@ bb:
187132
ret void
188133
}
189134

135+
; Test: a chain of three llvm.amdgcn.mfma.f32.32x32x1f32 calls whose first
; accumulator input (src2) is the constant zeroinitializer rather than a loaded
; value. The autogenerated assertions below expect that constant to appear as
; the literal src2 operand `0` of the first v_mfma, with every MFMA result held
; in v[0:31] and stored from there.
; The assertions are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(ptr addrspace(1) %arg) #0 {
136+
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2:
137+
; CHECK: ; %bb.0: ; %bb
138+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
139+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
140+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
141+
; CHECK-NEXT: s_nop 0
142+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
143+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
144+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
145+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
146+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
147+
; CHECK-NEXT: s_nop 7
148+
; CHECK-NEXT: s_nop 7
149+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
150+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
151+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
152+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
153+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
154+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
155+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
156+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
157+
; CHECK-NEXT: s_endpgm
158+
bb:
159+
; NOTE(review): %id, %gep and %in.1 are not referenced by the MFMA chain or the
; store below, and the expected assembly above contains no global_load.
; Presumably left over from the load-based tests in this file - confirm before
; removing.
%id = call i32 @llvm.amdgcn.workitem.id.x()
160+
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
161+
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
162+
; First MFMA: src2 is the all-zero vector constant.
%mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
163+
; Each following MFMA accumulates onto the previous result.
%mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
164+
%mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
165+
; The final result is written to %arg itself, not to the per-workitem %gep.
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
166+
ret void
167+
}
168+
169+
; Test: a chain of three llvm.amdgcn.mfma.f32.32x32x1f32 calls whose first
; accumulator input (src2) is the constant splat (float 1.0) rather than a
; loaded value. The autogenerated assertions below expect that constant to
; appear as the literal src2 operand `1.0` of the first v_mfma, with every MFMA
; result held in v[0:31] and stored from there.
; The assertions are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(ptr addrspace(1) %arg) #0 {
170+
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2:
171+
; CHECK: ; %bb.0: ; %bb
172+
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
173+
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
174+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
175+
; CHECK-NEXT: s_nop 0
176+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 1.0
177+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
178+
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
179+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
180+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
181+
; CHECK-NEXT: s_nop 7
182+
; CHECK-NEXT: s_nop 7
183+
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
184+
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
185+
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
186+
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
187+
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
188+
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
189+
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
190+
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
191+
; CHECK-NEXT: s_endpgm
192+
bb:
193+
; NOTE(review): %id, %gep and %in.1 are not referenced by the MFMA chain or the
; store below, and the expected assembly above contains no global_load.
; Presumably left over from the load-based tests in this file - confirm before
; removing.
%id = call i32 @llvm.amdgcn.workitem.id.x()
194+
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
195+
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
196+
; First MFMA: src2 is the vector constant with every lane equal to 1.0.
%mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> splat (float 1.0), i32 0, i32 0, i32 0)
197+
; Each following MFMA accumulates onto the previous result.
%mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
198+
%mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
199+
; The final result is written to %arg itself, not to the per-workitem %gep.
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
200+
ret void
201+
}
202+
190203
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #1
191204
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
192205

193-
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,4" }
206+
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="4,4" }
194207
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
195208
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

0 commit comments

Comments
 (0)