1
1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2
- ; RUN: llc -mcpu=gfx90a < %s | FileCheck %s
2
+ ; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
3
3
4
4
target triple = "amdgcn-amd-amdhsa"
5
5
6
6
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma (ptr addrspace (1 ) %arg ) #0 {
7
7
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma:
8
8
; CHECK: ; %bb.0: ; %bb
9
9
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
10
+ ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
10
11
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
12
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
13
+ ; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
11
14
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
12
15
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
13
16
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
@@ -18,117 +21,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
18
21
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
19
22
; CHECK-NEXT: s_nop 0
20
23
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
24
+ ; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
25
+ ; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
21
26
; CHECK-NEXT: s_waitcnt vmcnt(0)
22
- ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
23
- ; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
24
- ; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
25
- ; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
26
- ; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
27
- ; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
28
- ; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
29
- ; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
30
- ; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
31
- ; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
32
- ; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
33
- ; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
34
- ; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
35
- ; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
36
- ; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
37
- ; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
38
- ; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
39
- ; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
40
- ; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
41
- ; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
42
- ; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
43
- ; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
44
- ; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
45
- ; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
46
- ; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
47
- ; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
48
- ; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
49
- ; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
50
- ; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
51
- ; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
52
- ; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
53
- ; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
54
- ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
55
- ; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
56
- ; CHECK-NEXT: s_nop 1
57
- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
58
- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
27
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
28
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
59
29
; CHECK-NEXT: s_nop 7
60
30
; CHECK-NEXT: s_nop 7
61
- ; CHECK-NEXT: s_nop 2
62
- ; CHECK-NEXT: v_accvgpr_read_b32 v4, a59
63
- ; CHECK-NEXT: v_accvgpr_read_b32 v5, a58
64
- ; CHECK-NEXT: v_accvgpr_read_b32 v6, a57
65
- ; CHECK-NEXT: v_accvgpr_read_b32 v7, a56
66
- ; CHECK-NEXT: v_accvgpr_read_b32 v8, a55
67
- ; CHECK-NEXT: v_accvgpr_read_b32 v9, a54
68
- ; CHECK-NEXT: v_accvgpr_read_b32 v10, a53
69
- ; CHECK-NEXT: v_accvgpr_read_b32 v11, a52
70
- ; CHECK-NEXT: v_accvgpr_read_b32 v12, a51
71
- ; CHECK-NEXT: v_accvgpr_read_b32 v13, a50
72
- ; CHECK-NEXT: v_accvgpr_read_b32 v14, a49
73
- ; CHECK-NEXT: v_accvgpr_read_b32 v15, a48
74
- ; CHECK-NEXT: v_accvgpr_read_b32 v16, a47
75
- ; CHECK-NEXT: v_accvgpr_read_b32 v17, a46
76
- ; CHECK-NEXT: v_accvgpr_read_b32 v18, a45
77
- ; CHECK-NEXT: v_accvgpr_read_b32 v19, a44
78
- ; CHECK-NEXT: v_accvgpr_read_b32 v20, a43
79
- ; CHECK-NEXT: v_accvgpr_read_b32 v21, a42
80
- ; CHECK-NEXT: v_accvgpr_read_b32 v22, a41
81
- ; CHECK-NEXT: v_accvgpr_read_b32 v23, a40
82
- ; CHECK-NEXT: v_accvgpr_read_b32 v24, a39
83
- ; CHECK-NEXT: v_accvgpr_read_b32 v25, a38
84
- ; CHECK-NEXT: v_accvgpr_read_b32 v26, a37
85
- ; CHECK-NEXT: v_accvgpr_read_b32 v27, a36
86
- ; CHECK-NEXT: v_accvgpr_read_b32 v28, a35
87
- ; CHECK-NEXT: v_accvgpr_read_b32 v29, a34
88
- ; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32
89
- ; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33
90
- ; CHECK-NEXT: v_accvgpr_write_b32 a4, v29
91
- ; CHECK-NEXT: v_accvgpr_write_b32 a5, v28
92
- ; CHECK-NEXT: v_accvgpr_write_b32 a6, v27
93
- ; CHECK-NEXT: v_accvgpr_write_b32 a7, v26
94
- ; CHECK-NEXT: v_accvgpr_write_b32 a8, v25
95
- ; CHECK-NEXT: v_accvgpr_write_b32 a9, v24
96
- ; CHECK-NEXT: v_accvgpr_write_b32 a10, v23
97
- ; CHECK-NEXT: v_accvgpr_write_b32 a11, v22
98
- ; CHECK-NEXT: v_accvgpr_write_b32 a12, v21
99
- ; CHECK-NEXT: v_accvgpr_write_b32 a13, v20
100
- ; CHECK-NEXT: v_accvgpr_write_b32 a14, v19
101
- ; CHECK-NEXT: v_accvgpr_write_b32 a15, v18
102
- ; CHECK-NEXT: v_accvgpr_write_b32 a16, v17
103
- ; CHECK-NEXT: v_accvgpr_write_b32 a17, v16
104
- ; CHECK-NEXT: v_accvgpr_write_b32 a18, v15
105
- ; CHECK-NEXT: v_accvgpr_write_b32 a19, v14
106
- ; CHECK-NEXT: v_accvgpr_write_b32 a20, v13
107
- ; CHECK-NEXT: v_accvgpr_write_b32 a21, v12
108
- ; CHECK-NEXT: v_accvgpr_write_b32 a22, v11
109
- ; CHECK-NEXT: v_accvgpr_write_b32 a23, v10
110
- ; CHECK-NEXT: v_accvgpr_write_b32 a24, v9
111
- ; CHECK-NEXT: v_accvgpr_write_b32 a25, v8
112
- ; CHECK-NEXT: v_accvgpr_write_b32 a26, v7
113
- ; CHECK-NEXT: v_accvgpr_write_b32 a27, v6
114
- ; CHECK-NEXT: v_accvgpr_write_b32 a28, v5
115
- ; CHECK-NEXT: v_accvgpr_write_b32 a29, v4
116
- ; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60
117
- ; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61
118
31
; CHECK-NEXT: s_nop 1
119
- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
120
- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
32
+ ; CHECK-NEXT: v_mov_b32_e32 v2, v32
33
+ ; CHECK-NEXT: v_mov_b32_e32 v3, v33
34
+ ; CHECK-NEXT: v_mov_b32_e32 v4, v34
35
+ ; CHECK-NEXT: v_mov_b32_e32 v5, v35
36
+ ; CHECK-NEXT: v_mov_b32_e32 v6, v36
37
+ ; CHECK-NEXT: v_mov_b32_e32 v7, v37
38
+ ; CHECK-NEXT: v_mov_b32_e32 v8, v38
39
+ ; CHECK-NEXT: v_mov_b32_e32 v9, v39
40
+ ; CHECK-NEXT: v_mov_b32_e32 v10, v40
41
+ ; CHECK-NEXT: v_mov_b32_e32 v11, v41
42
+ ; CHECK-NEXT: v_mov_b32_e32 v12, v42
43
+ ; CHECK-NEXT: v_mov_b32_e32 v13, v43
44
+ ; CHECK-NEXT: v_mov_b32_e32 v14, v44
45
+ ; CHECK-NEXT: v_mov_b32_e32 v15, v45
46
+ ; CHECK-NEXT: v_mov_b32_e32 v16, v46
47
+ ; CHECK-NEXT: v_mov_b32_e32 v17, v47
48
+ ; CHECK-NEXT: v_mov_b32_e32 v18, v48
49
+ ; CHECK-NEXT: v_mov_b32_e32 v19, v49
50
+ ; CHECK-NEXT: v_mov_b32_e32 v20, v50
51
+ ; CHECK-NEXT: v_mov_b32_e32 v21, v51
52
+ ; CHECK-NEXT: v_mov_b32_e32 v22, v52
53
+ ; CHECK-NEXT: v_mov_b32_e32 v23, v53
54
+ ; CHECK-NEXT: v_mov_b32_e32 v24, v54
55
+ ; CHECK-NEXT: v_mov_b32_e32 v25, v55
56
+ ; CHECK-NEXT: v_mov_b32_e32 v26, v56
57
+ ; CHECK-NEXT: v_mov_b32_e32 v27, v57
58
+ ; CHECK-NEXT: v_mov_b32_e32 v28, v58
59
+ ; CHECK-NEXT: v_mov_b32_e32 v29, v59
60
+ ; CHECK-NEXT: v_mov_b32_e32 v30, v60
61
+ ; CHECK-NEXT: v_mov_b32_e32 v31, v61
62
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 0
63
+ ; CHECK-NEXT: s_nop 0
64
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
121
65
; CHECK-NEXT: s_nop 7
122
66
; CHECK-NEXT: s_nop 7
123
67
; CHECK-NEXT: s_nop 1
124
- ; CHECK-NEXT: global_store_dwordx4 v0, a [24:27], s[0:1] offset:96
125
- ; CHECK-NEXT: global_store_dwordx4 v0, a [28:31], s[0:1] offset:112
126
- ; CHECK-NEXT: global_store_dwordx4 v0, a [16:19], s[0:1] offset:64
127
- ; CHECK-NEXT: global_store_dwordx4 v0, a [20:23], s[0:1] offset:80
128
- ; CHECK-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
129
- ; CHECK-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
130
- ; CHECK-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
131
- ; CHECK-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
68
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [24:27], s[0:1] offset:96
69
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [28:31], s[0:1] offset:112
70
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [16:19], s[0:1] offset:64
71
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [20:23], s[0:1] offset:80
72
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [8:11], s[0:1] offset:32
73
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [12:15], s[0:1] offset:48
74
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [0:3], s[0:1]
75
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [4:7], s[0:1] offset:16
132
76
; CHECK-NEXT: s_endpgm
133
77
bb:
134
78
%id = call i32 @llvm.amdgcn.workitem.id.x ()
@@ -146,35 +90,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
146
90
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle:
147
91
; CHECK: ; %bb.0: ; %bb
148
92
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
93
+ ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
149
94
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
150
- ; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
95
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
96
+ ; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
151
97
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
152
- ; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
153
- ; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
154
- ; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
155
- ; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
156
- ; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
157
- ; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
158
- ; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
159
- ; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
160
- ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
161
- ; CHECK-NEXT: s_waitcnt vmcnt(0)
98
+ ; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
99
+ ; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
100
+ ; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
101
+ ; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
102
+ ; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
103
+ ; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
104
+ ; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
162
105
; CHECK-NEXT: s_nop 0
163
- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
164
- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
165
- ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
166
- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
106
+ ; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
107
+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
108
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
109
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
110
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
111
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 0
167
112
; CHECK-NEXT: s_nop 7
168
113
; CHECK-NEXT: s_nop 7
169
- ; CHECK-NEXT: s_nop 1
170
- ; CHECK-NEXT: global_store_dwordx4 v0, a [24:27], s[0:1] offset:96
171
- ; CHECK-NEXT: global_store_dwordx4 v0, a [28:31], s[0:1] offset:112
172
- ; CHECK-NEXT: global_store_dwordx4 v0, a [16:19], s[0:1] offset:64
173
- ; CHECK-NEXT: global_store_dwordx4 v0, a [20:23], s[0:1] offset:80
174
- ; CHECK-NEXT: global_store_dwordx4 v0, a [8:11], s[0:1] offset:32
175
- ; CHECK-NEXT: global_store_dwordx4 v0, a [12:15], s[0:1] offset:48
176
- ; CHECK-NEXT: global_store_dwordx4 v0, a [0:3], s[0:1]
177
- ; CHECK-NEXT: global_store_dwordx4 v0, a [4:7], s[0:1] offset:16
114
+ ; CHECK-NEXT: s_nop 0
115
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [24:27], s[0:1] offset:96
116
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [28:31], s[0:1] offset:112
117
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [16:19], s[0:1] offset:64
118
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [20:23], s[0:1] offset:80
119
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [8:11], s[0:1] offset:32
120
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [12:15], s[0:1] offset:48
121
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [0:3], s[0:1]
122
+ ; CHECK-NEXT: global_store_dwordx4 v32, v [4:7], s[0:1] offset:16
178
123
; CHECK-NEXT: s_endpgm
179
124
bb:
180
125
%id = call i32 @llvm.amdgcn.workitem.id.x ()
187
132
ret void
188
133
}
189
134
135
; Test: three chained llvm.amdgcn.mfma.f32.32x32x1f32 calls where the first
; call's accumulator operand (src2) is the constant zeroinitializer and each
; later call consumes the previous call's result.
; The CHECK lines below expect the first v_mfma to take the inline constant 0
; as its last operand, and all three MFMAs to write and read v[0:31] (no
; a[...] registers and no v_accvgpr_* copies appear in the expected output).
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
+ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2 (ptr addrspace (1 ) %arg ) #0 {
136
+ ; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2:
137
+ ; CHECK: ; %bb.0: ; %bb
138
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
139
+ ; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
140
+ ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
141
+ ; CHECK-NEXT: s_nop 0
142
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
143
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
144
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
145
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 0
146
+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
147
+ ; CHECK-NEXT: s_nop 7
148
+ ; CHECK-NEXT: s_nop 7
149
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
150
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
151
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
152
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
153
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
154
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
155
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
156
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
157
+ ; CHECK-NEXT: s_endpgm
158
+ bb:
159
; NOTE(review): %id, %gep and %in.1 are not used by any mfma call or by the
; store in this function, and the expected output above contains no
; global_load. Confirm whether this load is intentionally left unused.
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
160
+ %gep = getelementptr <32 x float >, ptr addrspace (1 ) %arg , i32 %id
161
+ %in.1 = load <32 x float >, ptr addrspace (1 ) %gep , align 128
162
+ %mai.1 = call <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float 1 .0 , float 2 .0 , <32 x float > zeroinitializer , i32 0 , i32 0 , i32 0 )
163
+ %mai.2 = call <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float 1 .0 , float 2 .0 , <32 x float > %mai.1 , i32 0 , i32 0 , i32 0 )
164
+ %mai.3 = call <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float 1 .0 , float 2 .0 , <32 x float > %mai.2 , i32 0 , i32 0 , i32 0 )
165
+ store <32 x float > %mai.3 , ptr addrspace (1 ) %arg , align 128
166
+ ret void
167
+ }
168
+
169
; Test: three chained llvm.amdgcn.mfma.f32.32x32x1f32 calls where the first
; call's accumulator operand (src2) is the constant splat (float 1.0) and each
; later call consumes the previous call's result.
; The CHECK lines below expect the first v_mfma to take the inline constant
; 1.0 as its last operand, and all three MFMAs to write and read v[0:31] (no
; a[...] registers and no v_accvgpr_* copies appear in the expected output).
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
+ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2 (ptr addrspace (1 ) %arg ) #0 {
170
+ ; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2:
171
+ ; CHECK: ; %bb.0: ; %bb
172
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
173
+ ; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
174
+ ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
175
+ ; CHECK-NEXT: s_nop 0
176
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 1.0
177
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
178
+ ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
179
+ ; CHECK-NEXT: v_mov_b32_e32 v32, 0
180
+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
181
+ ; CHECK-NEXT: s_nop 7
182
+ ; CHECK-NEXT: s_nop 7
183
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
184
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
185
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
186
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
187
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
188
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
189
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
190
+ ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
191
+ ; CHECK-NEXT: s_endpgm
192
+ bb:
193
; NOTE(review): %id, %gep and %in.1 are not used by any mfma call or by the
; store in this function, and the expected output above contains no
; global_load. Confirm whether this load is intentionally left unused.
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
194
+ %gep = getelementptr <32 x float >, ptr addrspace (1 ) %arg , i32 %id
195
+ %in.1 = load <32 x float >, ptr addrspace (1 ) %gep , align 128
196
+ %mai.1 = call <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float 1 .0 , float 2 .0 , <32 x float > splat (float 1 .0 ), i32 0 , i32 0 , i32 0 )
197
+ %mai.2 = call <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float 1 .0 , float 2 .0 , <32 x float > %mai.1 , i32 0 , i32 0 , i32 0 )
198
+ %mai.3 = call <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float 1 .0 , float 2 .0 , <32 x float > %mai.2 , i32 0 , i32 0 , i32 0 )
199
+ store <32 x float > %mai.3 , ptr addrspace (1 ) %arg , align 128
200
+ ret void
201
+ }
202
+
190
203
declare <32 x float > @llvm.amdgcn.mfma.f32.32x32x1f32 (float , float , <32 x float >, i32 immarg, i32 immarg, i32 immarg) #1
191
204
declare noundef i32 @llvm.amdgcn.workitem.id.x () #2
192
205
193
- attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z " "amdgpu-waves-per-eu" ="4,4" }
206
+ attributes #0 = { nounwind "amdgpu-flat-work-group-size" ="1,256" "amdgpu-waves-per-eu" ="4,4" }
194
207
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
195
208
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
0 commit comments