Skip to content

Commit 440c664

Browse files
committed
AMDGPU: Disable AGPR allocation in VGPR MFMA tests
The intent of these tests broke at some point: they were supposed to test both selection paths, but the AGPR and VGPR versions of the test functions were both selecting to the AGPR version. Explicitly disable AGPR usage in the VGPR tests with the "amdgpu-agpr-alloc"="0,0" function attribute.
1 parent 1b4db78 commit 440c664

File tree

5 files changed

+1598
-1922
lines changed

5 files changed

+1598
-1922
lines changed

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

Lines changed: 117 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
252252
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
253253
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
254254
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
255-
; GCN-NEXT: v_mov_b32_e32 v8, 0
255+
; GCN-NEXT: v_mov_b32_e32 v44, 0
256256
; GCN-NEXT: s_waitcnt lgkmcnt(0)
257-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
258-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
259-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
260-
; GCN-NEXT: v_accvgpr_write_b32 a31, s23
261-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
262-
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
263-
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
264-
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
265-
; GCN-NEXT: v_accvgpr_write_b32 a27, s19
266-
; GCN-NEXT: v_accvgpr_write_b32 a26, s18
267-
; GCN-NEXT: v_accvgpr_write_b32 a25, s17
268-
; GCN-NEXT: v_accvgpr_write_b32 a24, s16
269-
; GCN-NEXT: v_accvgpr_write_b32 a23, s15
270-
; GCN-NEXT: v_accvgpr_write_b32 a22, s14
271-
; GCN-NEXT: v_accvgpr_write_b32 a21, s13
272-
; GCN-NEXT: v_accvgpr_write_b32 a20, s12
273-
; GCN-NEXT: v_accvgpr_write_b32 a19, s11
274-
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
275-
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
276-
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
277-
; GCN-NEXT: v_mov_b32_e32 v10, s20
278-
; GCN-NEXT: v_mov_b32_e32 v11, s21
279-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
280-
; GCN-NEXT: v_mov_b32_e32 v12, s22
281-
; GCN-NEXT: v_mov_b32_e32 v13, s23
282-
; GCN-NEXT: v_mov_b32_e32 v0, s16
283-
; GCN-NEXT: v_mov_b32_e32 v1, s17
284-
; GCN-NEXT: v_mov_b32_e32 v2, s18
285-
; GCN-NEXT: v_mov_b32_e32 v3, s19
286-
; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
287-
; GCN-NEXT: s_waitcnt vmcnt(0)
288-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
257+
; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
258+
; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
259+
; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
260+
; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
261+
; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
262+
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
263+
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
264+
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
265+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
266+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
267+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
268+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
269+
; GCN-NEXT: v_mov_b32_e32 v40, s20
270+
; GCN-NEXT: v_mov_b32_e32 v41, s21
271+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
272+
; GCN-NEXT: v_mov_b32_e32 v42, s22
273+
; GCN-NEXT: v_mov_b32_e32 v43, s23
274+
; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
275+
; GCN-NEXT: s_waitcnt vmcnt(0)
276+
; GCN-NEXT: s_nop 2
277+
; GCN-NEXT: v_mov_b32_e32 v16, s16
278+
; GCN-NEXT: v_mov_b32_e32 v17, s17
279+
; GCN-NEXT: v_mov_b32_e32 v18, s18
280+
; GCN-NEXT: v_mov_b32_e32 v19, s19
281+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
289282
; GCN-NEXT: s_waitcnt vmcnt(0)
290283
; GCN-NEXT: s_nop 0
291-
; GCN-NEXT: v_mov_b32_e32 v0, s12
292-
; GCN-NEXT: v_mov_b32_e32 v1, s13
293-
; GCN-NEXT: v_mov_b32_e32 v2, s14
294-
; GCN-NEXT: v_mov_b32_e32 v3, s15
295-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
284+
; GCN-NEXT: v_mov_b32_e32 v16, s12
285+
; GCN-NEXT: v_mov_b32_e32 v17, s13
286+
; GCN-NEXT: v_mov_b32_e32 v18, s14
287+
; GCN-NEXT: v_mov_b32_e32 v19, s15
288+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
296289
; GCN-NEXT: s_waitcnt vmcnt(0)
297290
; GCN-NEXT: s_nop 0
298-
; GCN-NEXT: v_mov_b32_e32 v0, s8
299-
; GCN-NEXT: v_mov_b32_e32 v1, s9
300-
; GCN-NEXT: v_mov_b32_e32 v2, s10
301-
; GCN-NEXT: v_mov_b32_e32 v3, s11
302-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
291+
; GCN-NEXT: v_mov_b32_e32 v16, s8
292+
; GCN-NEXT: v_mov_b32_e32 v17, s9
293+
; GCN-NEXT: v_mov_b32_e32 v18, s10
294+
; GCN-NEXT: v_mov_b32_e32 v19, s11
295+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
303296
; GCN-NEXT: s_waitcnt vmcnt(0)
304-
; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
297+
; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
305298
; GCN-NEXT: s_waitcnt vmcnt(0)
306-
; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
299+
; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
307300
; GCN-NEXT: s_waitcnt vmcnt(0)
308-
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
301+
; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
309302
; GCN-NEXT: s_waitcnt vmcnt(0)
310-
; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
303+
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
311304
; GCN-NEXT: s_waitcnt vmcnt(0)
312305
; GCN-NEXT: s_endpgm
313306
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
322315
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
323316
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
324317
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
325-
; GCN-NEXT: v_mov_b32_e32 v8, 0
318+
; GCN-NEXT: v_mov_b32_e32 v44, 0
326319
; GCN-NEXT: s_waitcnt lgkmcnt(0)
327-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
328-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
329-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
330-
; GCN-NEXT: v_accvgpr_write_b32 a31, s23
331-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
332-
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
333-
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
334-
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
335-
; GCN-NEXT: v_accvgpr_write_b32 a27, s19
336-
; GCN-NEXT: v_accvgpr_write_b32 a26, s18
337-
; GCN-NEXT: v_accvgpr_write_b32 a25, s17
338-
; GCN-NEXT: v_accvgpr_write_b32 a24, s16
339-
; GCN-NEXT: v_accvgpr_write_b32 a23, s15
340-
; GCN-NEXT: v_accvgpr_write_b32 a22, s14
341-
; GCN-NEXT: v_accvgpr_write_b32 a21, s13
342-
; GCN-NEXT: v_accvgpr_write_b32 a20, s12
343-
; GCN-NEXT: v_accvgpr_write_b32 a19, s11
344-
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
345-
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
346-
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
347-
; GCN-NEXT: v_mov_b32_e32 v10, s20
348-
; GCN-NEXT: v_mov_b32_e32 v11, s21
349-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
350-
; GCN-NEXT: v_mov_b32_e32 v12, s22
351-
; GCN-NEXT: v_mov_b32_e32 v13, s23
352-
; GCN-NEXT: v_mov_b32_e32 v0, s16
353-
; GCN-NEXT: v_mov_b32_e32 v1, s17
354-
; GCN-NEXT: v_mov_b32_e32 v2, s18
355-
; GCN-NEXT: v_mov_b32_e32 v3, s19
356-
; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
357-
; GCN-NEXT: s_waitcnt vmcnt(0)
358-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
320+
; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
321+
; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
322+
; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
323+
; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
324+
; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
325+
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
326+
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
327+
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
328+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
329+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
330+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
331+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
332+
; GCN-NEXT: v_mov_b32_e32 v40, s20
333+
; GCN-NEXT: v_mov_b32_e32 v41, s21
334+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
335+
; GCN-NEXT: v_mov_b32_e32 v42, s22
336+
; GCN-NEXT: v_mov_b32_e32 v43, s23
337+
; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
338+
; GCN-NEXT: s_waitcnt vmcnt(0)
339+
; GCN-NEXT: s_nop 2
340+
; GCN-NEXT: v_mov_b32_e32 v16, s16
341+
; GCN-NEXT: v_mov_b32_e32 v17, s17
342+
; GCN-NEXT: v_mov_b32_e32 v18, s18
343+
; GCN-NEXT: v_mov_b32_e32 v19, s19
344+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
359345
; GCN-NEXT: s_waitcnt vmcnt(0)
360346
; GCN-NEXT: s_nop 0
361-
; GCN-NEXT: v_mov_b32_e32 v0, s12
362-
; GCN-NEXT: v_mov_b32_e32 v1, s13
363-
; GCN-NEXT: v_mov_b32_e32 v2, s14
364-
; GCN-NEXT: v_mov_b32_e32 v3, s15
365-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
347+
; GCN-NEXT: v_mov_b32_e32 v16, s12
348+
; GCN-NEXT: v_mov_b32_e32 v17, s13
349+
; GCN-NEXT: v_mov_b32_e32 v18, s14
350+
; GCN-NEXT: v_mov_b32_e32 v19, s15
351+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
366352
; GCN-NEXT: s_waitcnt vmcnt(0)
367353
; GCN-NEXT: s_nop 0
368-
; GCN-NEXT: v_mov_b32_e32 v0, s8
369-
; GCN-NEXT: v_mov_b32_e32 v1, s9
370-
; GCN-NEXT: v_mov_b32_e32 v2, s10
371-
; GCN-NEXT: v_mov_b32_e32 v3, s11
372-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
354+
; GCN-NEXT: v_mov_b32_e32 v16, s8
355+
; GCN-NEXT: v_mov_b32_e32 v17, s9
356+
; GCN-NEXT: v_mov_b32_e32 v18, s10
357+
; GCN-NEXT: v_mov_b32_e32 v19, s11
358+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
373359
; GCN-NEXT: s_waitcnt vmcnt(0)
374-
; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
360+
; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
375361
; GCN-NEXT: s_waitcnt vmcnt(0)
376-
; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
362+
; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
377363
; GCN-NEXT: s_waitcnt vmcnt(0)
378-
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
364+
; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
379365
; GCN-NEXT: s_waitcnt vmcnt(0)
380-
; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
366+
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
381367
; GCN-NEXT: s_waitcnt vmcnt(0)
382368
; GCN-NEXT: s_endpgm
383369
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
393379
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
394380
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
395381
; GCN-NEXT: s_waitcnt lgkmcnt(0)
396-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
397-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
398-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
399-
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
400-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
401-
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
402-
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
403-
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
404-
; GCN-NEXT: v_accvgpr_write_b32 a4, s12
405-
; GCN-NEXT: v_accvgpr_write_b32 a5, s13
406-
; GCN-NEXT: v_accvgpr_write_b32 a6, s14
407-
; GCN-NEXT: v_accvgpr_write_b32 a7, s15
408-
; GCN-NEXT: v_accvgpr_write_b32 a8, s16
409-
; GCN-NEXT: v_accvgpr_write_b32 a9, s17
410-
; GCN-NEXT: v_accvgpr_write_b32 a10, s18
411-
; GCN-NEXT: v_accvgpr_write_b32 a11, s19
412-
; GCN-NEXT: v_accvgpr_write_b32 a12, s20
413-
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
414-
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
415-
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
382+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
383+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
384+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
385+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
386+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
387+
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
388+
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
389+
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
390+
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
391+
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
392+
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
393+
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
416394
; GCN-NEXT: s_nop 1
417-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
418-
; GCN-NEXT: v_mov_b32_e32 v0, 0
395+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
396+
; GCN-NEXT: v_mov_b32_e32 v16, 0
419397
; GCN-NEXT: s_nop 7
420398
; GCN-NEXT: s_nop 2
421-
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
422-
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
423-
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
424-
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
399+
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
400+
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
401+
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
402+
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
425403
; GCN-NEXT: s_endpgm
426404
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
427405
store <16 x float> %result, ptr addrspace(1) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
435413
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
436414
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
437415
; GCN-NEXT: s_waitcnt lgkmcnt(0)
438-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
439-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
440-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
441-
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
442-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
443-
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
444-
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
445-
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
446-
; GCN-NEXT: v_accvgpr_write_b32 a4, s12
447-
; GCN-NEXT: v_accvgpr_write_b32 a5, s13
448-
; GCN-NEXT: v_accvgpr_write_b32 a6, s14
449-
; GCN-NEXT: v_accvgpr_write_b32 a7, s15
450-
; GCN-NEXT: v_accvgpr_write_b32 a8, s16
451-
; GCN-NEXT: v_accvgpr_write_b32 a9, s17
452-
; GCN-NEXT: v_accvgpr_write_b32 a10, s18
453-
; GCN-NEXT: v_accvgpr_write_b32 a11, s19
454-
; GCN-NEXT: v_accvgpr_write_b32 a12, s20
455-
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
456-
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
457-
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
416+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
417+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
418+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
419+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
420+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
421+
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
422+
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
423+
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
424+
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
425+
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
426+
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
427+
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
458428
; GCN-NEXT: s_nop 1
459-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
460-
; GCN-NEXT: v_mov_b32_e32 v0, 0
429+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
430+
; GCN-NEXT: v_mov_b32_e32 v16, 0
461431
; GCN-NEXT: s_nop 7
462432
; GCN-NEXT: s_nop 2
463-
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
464-
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
465-
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
466-
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
433+
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
434+
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
435+
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
436+
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
467437
; GCN-NEXT: s_endpgm
468438
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
469439
store <16 x float> %result, ptr addrspace(1) %out
470440
ret void
471441
}
472442

473-
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
443+
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
474444
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }

0 commit comments

Comments
 (0)