Skip to content

Commit 6fb8e58

Browse files
authored
AMDGPU: Disable AGPR allocation in VGPR MFMA tests (#150873)
1 parent f9f68af commit 6fb8e58

File tree

5 files changed: 1598 additions (+1598) and 1922 deletions (-1922).

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

Lines changed: 117 additions & 147 deletions
Original file line number | Diff line number | Diff line change
@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
252252
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
253253
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
254254
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
255-
; GCN-NEXT: v_mov_b32_e32 v8, 0
255+
; GCN-NEXT: v_mov_b32_e32 v44, 0
256256
; GCN-NEXT: s_waitcnt lgkmcnt(0)
257-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
258-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
259-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
260-
; GCN-NEXT: v_accvgpr_write_b32 a31, s23
261-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
262-
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
263-
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
264-
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
265-
; GCN-NEXT: v_accvgpr_write_b32 a27, s19
266-
; GCN-NEXT: v_accvgpr_write_b32 a26, s18
267-
; GCN-NEXT: v_accvgpr_write_b32 a25, s17
268-
; GCN-NEXT: v_accvgpr_write_b32 a24, s16
269-
; GCN-NEXT: v_accvgpr_write_b32 a23, s15
270-
; GCN-NEXT: v_accvgpr_write_b32 a22, s14
271-
; GCN-NEXT: v_accvgpr_write_b32 a21, s13
272-
; GCN-NEXT: v_accvgpr_write_b32 a20, s12
273-
; GCN-NEXT: v_accvgpr_write_b32 a19, s11
274-
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
275-
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
276-
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
277-
; GCN-NEXT: v_mov_b32_e32 v10, s20
278-
; GCN-NEXT: v_mov_b32_e32 v11, s21
279-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
280-
; GCN-NEXT: v_mov_b32_e32 v12, s22
281-
; GCN-NEXT: v_mov_b32_e32 v13, s23
282-
; GCN-NEXT: v_mov_b32_e32 v0, s16
283-
; GCN-NEXT: v_mov_b32_e32 v1, s17
284-
; GCN-NEXT: v_mov_b32_e32 v2, s18
285-
; GCN-NEXT: v_mov_b32_e32 v3, s19
286-
; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
287-
; GCN-NEXT: s_waitcnt vmcnt(0)
288-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
257+
; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
258+
; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
259+
; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
260+
; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
261+
; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
262+
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
263+
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
264+
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
265+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
266+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
267+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
268+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
269+
; GCN-NEXT: v_mov_b32_e32 v40, s20
270+
; GCN-NEXT: v_mov_b32_e32 v41, s21
271+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
272+
; GCN-NEXT: v_mov_b32_e32 v42, s22
273+
; GCN-NEXT: v_mov_b32_e32 v43, s23
274+
; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
275+
; GCN-NEXT: s_waitcnt vmcnt(0)
276+
; GCN-NEXT: s_nop 2
277+
; GCN-NEXT: v_mov_b32_e32 v16, s16
278+
; GCN-NEXT: v_mov_b32_e32 v17, s17
279+
; GCN-NEXT: v_mov_b32_e32 v18, s18
280+
; GCN-NEXT: v_mov_b32_e32 v19, s19
281+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
289282
; GCN-NEXT: s_waitcnt vmcnt(0)
290283
; GCN-NEXT: s_nop 0
291-
; GCN-NEXT: v_mov_b32_e32 v0, s12
292-
; GCN-NEXT: v_mov_b32_e32 v1, s13
293-
; GCN-NEXT: v_mov_b32_e32 v2, s14
294-
; GCN-NEXT: v_mov_b32_e32 v3, s15
295-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
284+
; GCN-NEXT: v_mov_b32_e32 v16, s12
285+
; GCN-NEXT: v_mov_b32_e32 v17, s13
286+
; GCN-NEXT: v_mov_b32_e32 v18, s14
287+
; GCN-NEXT: v_mov_b32_e32 v19, s15
288+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
296289
; GCN-NEXT: s_waitcnt vmcnt(0)
297290
; GCN-NEXT: s_nop 0
298-
; GCN-NEXT: v_mov_b32_e32 v0, s8
299-
; GCN-NEXT: v_mov_b32_e32 v1, s9
300-
; GCN-NEXT: v_mov_b32_e32 v2, s10
301-
; GCN-NEXT: v_mov_b32_e32 v3, s11
302-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
291+
; GCN-NEXT: v_mov_b32_e32 v16, s8
292+
; GCN-NEXT: v_mov_b32_e32 v17, s9
293+
; GCN-NEXT: v_mov_b32_e32 v18, s10
294+
; GCN-NEXT: v_mov_b32_e32 v19, s11
295+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
303296
; GCN-NEXT: s_waitcnt vmcnt(0)
304-
; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
297+
; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
305298
; GCN-NEXT: s_waitcnt vmcnt(0)
306-
; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
299+
; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
307300
; GCN-NEXT: s_waitcnt vmcnt(0)
308-
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
301+
; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
309302
; GCN-NEXT: s_waitcnt vmcnt(0)
310-
; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
303+
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
311304
; GCN-NEXT: s_waitcnt vmcnt(0)
312305
; GCN-NEXT: s_endpgm
313306
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
322315
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
323316
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
324317
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
325-
; GCN-NEXT: v_mov_b32_e32 v8, 0
318+
; GCN-NEXT: v_mov_b32_e32 v44, 0
326319
; GCN-NEXT: s_waitcnt lgkmcnt(0)
327-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
328-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
329-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
330-
; GCN-NEXT: v_accvgpr_write_b32 a31, s23
331-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
332-
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
333-
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
334-
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
335-
; GCN-NEXT: v_accvgpr_write_b32 a27, s19
336-
; GCN-NEXT: v_accvgpr_write_b32 a26, s18
337-
; GCN-NEXT: v_accvgpr_write_b32 a25, s17
338-
; GCN-NEXT: v_accvgpr_write_b32 a24, s16
339-
; GCN-NEXT: v_accvgpr_write_b32 a23, s15
340-
; GCN-NEXT: v_accvgpr_write_b32 a22, s14
341-
; GCN-NEXT: v_accvgpr_write_b32 a21, s13
342-
; GCN-NEXT: v_accvgpr_write_b32 a20, s12
343-
; GCN-NEXT: v_accvgpr_write_b32 a19, s11
344-
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
345-
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
346-
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
347-
; GCN-NEXT: v_mov_b32_e32 v10, s20
348-
; GCN-NEXT: v_mov_b32_e32 v11, s21
349-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
350-
; GCN-NEXT: v_mov_b32_e32 v12, s22
351-
; GCN-NEXT: v_mov_b32_e32 v13, s23
352-
; GCN-NEXT: v_mov_b32_e32 v0, s16
353-
; GCN-NEXT: v_mov_b32_e32 v1, s17
354-
; GCN-NEXT: v_mov_b32_e32 v2, s18
355-
; GCN-NEXT: v_mov_b32_e32 v3, s19
356-
; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
357-
; GCN-NEXT: s_waitcnt vmcnt(0)
358-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
320+
; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
321+
; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
322+
; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
323+
; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
324+
; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
325+
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
326+
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
327+
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
328+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
329+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
330+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
331+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
332+
; GCN-NEXT: v_mov_b32_e32 v40, s20
333+
; GCN-NEXT: v_mov_b32_e32 v41, s21
334+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
335+
; GCN-NEXT: v_mov_b32_e32 v42, s22
336+
; GCN-NEXT: v_mov_b32_e32 v43, s23
337+
; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
338+
; GCN-NEXT: s_waitcnt vmcnt(0)
339+
; GCN-NEXT: s_nop 2
340+
; GCN-NEXT: v_mov_b32_e32 v16, s16
341+
; GCN-NEXT: v_mov_b32_e32 v17, s17
342+
; GCN-NEXT: v_mov_b32_e32 v18, s18
343+
; GCN-NEXT: v_mov_b32_e32 v19, s19
344+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
359345
; GCN-NEXT: s_waitcnt vmcnt(0)
360346
; GCN-NEXT: s_nop 0
361-
; GCN-NEXT: v_mov_b32_e32 v0, s12
362-
; GCN-NEXT: v_mov_b32_e32 v1, s13
363-
; GCN-NEXT: v_mov_b32_e32 v2, s14
364-
; GCN-NEXT: v_mov_b32_e32 v3, s15
365-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
347+
; GCN-NEXT: v_mov_b32_e32 v16, s12
348+
; GCN-NEXT: v_mov_b32_e32 v17, s13
349+
; GCN-NEXT: v_mov_b32_e32 v18, s14
350+
; GCN-NEXT: v_mov_b32_e32 v19, s15
351+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
366352
; GCN-NEXT: s_waitcnt vmcnt(0)
367353
; GCN-NEXT: s_nop 0
368-
; GCN-NEXT: v_mov_b32_e32 v0, s8
369-
; GCN-NEXT: v_mov_b32_e32 v1, s9
370-
; GCN-NEXT: v_mov_b32_e32 v2, s10
371-
; GCN-NEXT: v_mov_b32_e32 v3, s11
372-
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
354+
; GCN-NEXT: v_mov_b32_e32 v16, s8
355+
; GCN-NEXT: v_mov_b32_e32 v17, s9
356+
; GCN-NEXT: v_mov_b32_e32 v18, s10
357+
; GCN-NEXT: v_mov_b32_e32 v19, s11
358+
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
373359
; GCN-NEXT: s_waitcnt vmcnt(0)
374-
; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
360+
; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
375361
; GCN-NEXT: s_waitcnt vmcnt(0)
376-
; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
362+
; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
377363
; GCN-NEXT: s_waitcnt vmcnt(0)
378-
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
364+
; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
379365
; GCN-NEXT: s_waitcnt vmcnt(0)
380-
; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
366+
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
381367
; GCN-NEXT: s_waitcnt vmcnt(0)
382368
; GCN-NEXT: s_endpgm
383369
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
393379
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
394380
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
395381
; GCN-NEXT: s_waitcnt lgkmcnt(0)
396-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
397-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
398-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
399-
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
400-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
401-
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
402-
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
403-
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
404-
; GCN-NEXT: v_accvgpr_write_b32 a4, s12
405-
; GCN-NEXT: v_accvgpr_write_b32 a5, s13
406-
; GCN-NEXT: v_accvgpr_write_b32 a6, s14
407-
; GCN-NEXT: v_accvgpr_write_b32 a7, s15
408-
; GCN-NEXT: v_accvgpr_write_b32 a8, s16
409-
; GCN-NEXT: v_accvgpr_write_b32 a9, s17
410-
; GCN-NEXT: v_accvgpr_write_b32 a10, s18
411-
; GCN-NEXT: v_accvgpr_write_b32 a11, s19
412-
; GCN-NEXT: v_accvgpr_write_b32 a12, s20
413-
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
414-
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
415-
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
382+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
383+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
384+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
385+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
386+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
387+
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
388+
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
389+
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
390+
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
391+
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
392+
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
393+
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
416394
; GCN-NEXT: s_nop 1
417-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
418-
; GCN-NEXT: v_mov_b32_e32 v0, 0
395+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
396+
; GCN-NEXT: v_mov_b32_e32 v16, 0
419397
; GCN-NEXT: s_nop 7
420398
; GCN-NEXT: s_nop 2
421-
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
422-
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
423-
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
424-
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
399+
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
400+
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
401+
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
402+
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
425403
; GCN-NEXT: s_endpgm
426404
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
427405
store <16 x float> %result, ptr addrspace(1) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
435413
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
436414
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
437415
; GCN-NEXT: s_waitcnt lgkmcnt(0)
438-
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
439-
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
440-
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
441-
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
442-
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
443-
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
444-
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
445-
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
446-
; GCN-NEXT: v_accvgpr_write_b32 a4, s12
447-
; GCN-NEXT: v_accvgpr_write_b32 a5, s13
448-
; GCN-NEXT: v_accvgpr_write_b32 a6, s14
449-
; GCN-NEXT: v_accvgpr_write_b32 a7, s15
450-
; GCN-NEXT: v_accvgpr_write_b32 a8, s16
451-
; GCN-NEXT: v_accvgpr_write_b32 a9, s17
452-
; GCN-NEXT: v_accvgpr_write_b32 a10, s18
453-
; GCN-NEXT: v_accvgpr_write_b32 a11, s19
454-
; GCN-NEXT: v_accvgpr_write_b32 a12, s20
455-
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
456-
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
457-
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
416+
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
417+
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
418+
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
419+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
420+
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
421+
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
422+
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
423+
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
424+
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
425+
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
426+
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
427+
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
458428
; GCN-NEXT: s_nop 1
459-
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
460-
; GCN-NEXT: v_mov_b32_e32 v0, 0
429+
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
430+
; GCN-NEXT: v_mov_b32_e32 v16, 0
461431
; GCN-NEXT: s_nop 7
462432
; GCN-NEXT: s_nop 2
463-
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
464-
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
465-
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
466-
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
433+
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
434+
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
435+
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
436+
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
467437
; GCN-NEXT: s_endpgm
468438
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
469439
store <16 x float> %result, ptr addrspace(1) %out
470440
ret void
471441
}
472442

473-
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
443+
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
474444
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }

0 commit comments

Comments
 (0)