11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2- ; RUN: llc -mcpu=gfx90a < %s | FileCheck %s
2+ ; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
33
44target triple = "amdgcn-amd-amdhsa"
55
66define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma (ptr addrspace (1 ) %arg ) #0 {
77; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma:
88; CHECK: ; %bb.0: ; %bb
99; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
10+ ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1011; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
12+ ; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
13+ ; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
1114; CHECK-NEXT: s_waitcnt lgkmcnt(0)
1215; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
1316; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
@@ -18,117 +21,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
1821; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
1922; CHECK-NEXT: s_nop 0
2023; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
24+ ; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
25+ ; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
2126; CHECK-NEXT: s_waitcnt vmcnt(0)
22- ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
23- ; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
24- ; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
25- ; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
26- ; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
27- ; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
28- ; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
29- ; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
30- ; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
31- ; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
32- ; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
33- ; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
34- ; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
35- ; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
36- ; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
37- ; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
38- ; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
39- ; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
40- ; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
41- ; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
42- ; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
43- ; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
44- ; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
45- ; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
46- ; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
47- ; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
48- ; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
49- ; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
50- ; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
51- ; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
52- ; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
53- ; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
54- ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
55- ; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
56- ; CHECK-NEXT: s_nop 1
57- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
58- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
27+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
28+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
5929; CHECK-NEXT: s_nop 7
6030; CHECK-NEXT: s_nop 7
61- ; CHECK-NEXT: s_nop 2
62- ; CHECK-NEXT: v_accvgpr_read_b32 v4, a59
63- ; CHECK-NEXT: v_accvgpr_read_b32 v5, a58
64- ; CHECK-NEXT: v_accvgpr_read_b32 v6, a57
65- ; CHECK-NEXT: v_accvgpr_read_b32 v7, a56
66- ; CHECK-NEXT: v_accvgpr_read_b32 v8, a55
67- ; CHECK-NEXT: v_accvgpr_read_b32 v9, a54
68- ; CHECK-NEXT: v_accvgpr_read_b32 v10, a53
69- ; CHECK-NEXT: v_accvgpr_read_b32 v11, a52
70- ; CHECK-NEXT: v_accvgpr_read_b32 v12, a51
71- ; CHECK-NEXT: v_accvgpr_read_b32 v13, a50
72- ; CHECK-NEXT: v_accvgpr_read_b32 v14, a49
73- ; CHECK-NEXT: v_accvgpr_read_b32 v15, a48
74- ; CHECK-NEXT: v_accvgpr_read_b32 v16, a47
75- ; CHECK-NEXT: v_accvgpr_read_b32 v17, a46
76- ; CHECK-NEXT: v_accvgpr_read_b32 v18, a45
77- ; CHECK-NEXT: v_accvgpr_read_b32 v19, a44
78- ; CHECK-NEXT: v_accvgpr_read_b32 v20, a43
79- ; CHECK-NEXT: v_accvgpr_read_b32 v21, a42
80- ; CHECK-NEXT: v_accvgpr_read_b32 v22, a41
81- ; CHECK-NEXT: v_accvgpr_read_b32 v23, a40
82- ; CHECK-NEXT: v_accvgpr_read_b32 v24, a39
83- ; CHECK-NEXT: v_accvgpr_read_b32 v25, a38
84- ; CHECK-NEXT: v_accvgpr_read_b32 v26, a37
85- ; CHECK-NEXT: v_accvgpr_read_b32 v27, a36
86- ; CHECK-NEXT: v_accvgpr_read_b32 v28, a35
87- ; CHECK-NEXT: v_accvgpr_read_b32 v29, a34
88- ; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32
89- ; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33
90- ; CHECK-NEXT: v_accvgpr_write_b32 a4, v29
91- ; CHECK-NEXT: v_accvgpr_write_b32 a5, v28
92- ; CHECK-NEXT: v_accvgpr_write_b32 a6, v27
93- ; CHECK-NEXT: v_accvgpr_write_b32 a7, v26
94- ; CHECK-NEXT: v_accvgpr_write_b32 a8, v25
95- ; CHECK-NEXT: v_accvgpr_write_b32 a9, v24
96- ; CHECK-NEXT: v_accvgpr_write_b32 a10, v23
97- ; CHECK-NEXT: v_accvgpr_write_b32 a11, v22
98- ; CHECK-NEXT: v_accvgpr_write_b32 a12, v21
99- ; CHECK-NEXT: v_accvgpr_write_b32 a13, v20
100- ; CHECK-NEXT: v_accvgpr_write_b32 a14, v19
101- ; CHECK-NEXT: v_accvgpr_write_b32 a15, v18
102- ; CHECK-NEXT: v_accvgpr_write_b32 a16, v17
103- ; CHECK-NEXT: v_accvgpr_write_b32 a17, v16
104- ; CHECK-NEXT: v_accvgpr_write_b32 a18, v15
105- ; CHECK-NEXT: v_accvgpr_write_b32 a19, v14
106- ; CHECK-NEXT: v_accvgpr_write_b32 a20, v13
107- ; CHECK-NEXT: v_accvgpr_write_b32 a21, v12
108- ; CHECK-NEXT: v_accvgpr_write_b32 a22, v11
109- ; CHECK-NEXT: v_accvgpr_write_b32 a23, v10
110- ; CHECK-NEXT: v_accvgpr_write_b32 a24, v9
111- ; CHECK-NEXT: v_accvgpr_write_b32 a25, v8
112- ; CHECK-NEXT: v_accvgpr_write_b32 a26, v7
113- ; CHECK-NEXT: v_accvgpr_write_b32 a27, v6
114- ; CHECK-NEXT: v_accvgpr_write_b32 a28, v5
115- ; CHECK-NEXT: v_accvgpr_write_b32 a29, v4
116- ; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60
117- ; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61
11831; CHECK-NEXT: s_nop 1
119- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
120- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
32+ ; CHECK-NEXT: v_mov_b32_e32 v2, v32
33+ ; CHECK-NEXT: v_mov_b32_e32 v3, v33
34+ ; CHECK-NEXT: v_mov_b32_e32 v4, v34
35+ ; CHECK-NEXT: v_mov_b32_e32 v5, v35
36+ ; CHECK-NEXT: v_mov_b32_e32 v6, v36
37+ ; CHECK-NEXT: v_mov_b32_e32 v7, v37
38+ ; CHECK-NEXT: v_mov_b32_e32 v8, v38
39+ ; CHECK-NEXT: v_mov_b32_e32 v9, v39
40+ ; CHECK-NEXT: v_mov_b32_e32 v10, v40
41+ ; CHECK-NEXT: v_mov_b32_e32 v11, v41
42+ ; CHECK-NEXT: v_mov_b32_e32 v12, v42
43+ ; CHECK-NEXT: v_mov_b32_e32 v13, v43
44+ ; CHECK-NEXT: v_mov_b32_e32 v14, v44
45+ ; CHECK-NEXT: v_mov_b32_e32 v15, v45
46+ ; CHECK-NEXT: v_mov_b32_e32 v16, v46
47+ ; CHECK-NEXT: v_mov_b32_e32 v17, v47
48+ ; CHECK-NEXT: v_mov_b32_e32 v18, v48
49+ ; CHECK-NEXT: v_mov_b32_e32 v19, v49
50+ ; CHECK-NEXT: v_mov_b32_e32 v20, v50
51+ ; CHECK-NEXT: v_mov_b32_e32 v21, v51
52+ ; CHECK-NEXT: v_mov_b32_e32 v22, v52
53+ ; CHECK-NEXT: v_mov_b32_e32 v23, v53
54+ ; CHECK-NEXT: v_mov_b32_e32 v24, v54
55+ ; CHECK-NEXT: v_mov_b32_e32 v25, v55
56+ ; CHECK-NEXT: v_mov_b32_e32 v26, v56
57+ ; CHECK-NEXT: v_mov_b32_e32 v27, v57
58+ ; CHECK-NEXT: v_mov_b32_e32 v28, v58
59+ ; CHECK-NEXT: v_mov_b32_e32 v29, v59
60+ ; CHECK-NEXT: v_mov_b32_e32 v30, v60
61+ ; CHECK-NEXT: v_mov_b32_e32 v31, v61
62+ ; CHECK-NEXT: v_mov_b32_e32 v32, 0
63+ ; CHECK-NEXT: s_nop 0
64+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
12165; CHECK-NEXT: s_nop 7
12266; CHECK-NEXT: s_nop 7
12367; CHECK-NEXT: s_nop 1
124- ; CHECK-NEXT: global_store_dwordx4 v0, a [24:27], s[0:1] offset:96
125- ; CHECK-NEXT: global_store_dwordx4 v0, a [28:31], s[0:1] offset:112
126- ; CHECK-NEXT: global_store_dwordx4 v0, a [16:19], s[0:1] offset:64
127- ; CHECK-NEXT: global_store_dwordx4 v0, a [20:23], s[0:1] offset:80
128- ; CHECK-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
129- ; CHECK-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
130- ; CHECK-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
131- ; CHECK-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
68+ ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
69+ ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
70+ ; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
71+ ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
72+ ; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
73+ ; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
74+ ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
75+ ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
13276; CHECK-NEXT: s_endpgm
13377bb:
13478 %id = call i32 @llvm.amdgcn.workitem.id.x ()
@@ -146,35 +90,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
14690; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle:
14791; CHECK: ; %bb.0: ; %bb
14892; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
93+ ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
14994; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
150- ; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
95+ ; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
96+ ; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
15197; CHECK-NEXT: s_waitcnt lgkmcnt(0)
152- ; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
153- ; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
154- ; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
155- ; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
156- ; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
157- ; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
158- ; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
159- ; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
160- ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
161- ; CHECK-NEXT: s_waitcnt vmcnt(0)
98+ ; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
99+ ; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
100+ ; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
101+ ; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
102+ ; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
103+ ; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
104+ ; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
162105; CHECK-NEXT: s_nop 0
163- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
164- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
165- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
166- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
106+ ; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
107+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
108+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
109+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
110+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
111+ ; CHECK-NEXT: v_mov_b32_e32 v32, 0
167112; CHECK-NEXT: s_nop 7
168113; CHECK-NEXT: s_nop 7
169- ; CHECK-NEXT: s_nop 1
170- ; CHECK-NEXT: global_store_dwordx4 v0, a [24:27], s[0:1] offset:96
171- ; CHECK-NEXT: global_store_dwordx4 v0, a [28:31], s[0:1] offset:112
172- ; CHECK-NEXT: global_store_dwordx4 v0, a [16:19], s[0:1] offset:64
173- ; CHECK-NEXT: global_store_dwordx4 v0, a [20:23], s[0:1] offset:80
174- ; CHECK-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
175- ; CHECK-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
176- ; CHECK-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
177- ; CHECK-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
114+ ; CHECK-NEXT: s_nop 0
115+ ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
116+ ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
117+ ; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
118+ ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
119+ ; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
120+ ; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
121+ ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
122+ ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
178123; CHECK-NEXT: s_endpgm
179124bb:
180125 %id = call i32 @llvm.amdgcn.workitem.id.x ()
187132 ret void
188133}
189134
; Chain of three MFMA calls where the first accumulator input (src2) is the
; constant zeroinitializer and each later call accumulates into the previous
; result. The expected output below has the first MFMA taking the literal 0 as
; its accumulator operand and all three MFMAs writing v[0:31], which is then
; stored to %arg.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(ptr addrspace(1) %arg) #0 {
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; CHECK-NEXT: s_endpgm
bb:
  ; %id, %gep and %in.1 are not consumed by anything below; the expected
  ; output above contains no global_load.
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
  ; First MFMA starts from an all-zero accumulator; the next two feed on the
  ; previous result.
  %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
  %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
  store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
  ret void
}

; Chain of three MFMA calls where the first accumulator input (src2) is a
; splat of 1.0 and each later call accumulates into the previous result. The
; expected output below has the first MFMA taking the literal 1.0 as its
; accumulator operand and all three MFMAs writing v[0:31], which is then
; stored to %arg.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(ptr addrspace(1) %arg) #0 {
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 1.0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; CHECK-NEXT: s_endpgm
bb:
  ; %id, %gep and %in.1 are not consumed by anything below; the expected
  ; output above contains no global_load.
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
  ; First MFMA starts from an accumulator of all 1.0; the next two feed on the
  ; previous result.
  %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> splat (float 1.0), i32 0, i32 0, i32 0)
  %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
  %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
  store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
  ret void
}

; Intrinsics called by the kernels in this file.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #1
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2

; #0 is the attribute group attached to every kernel definition above.
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="4,4" }
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
0 commit comments